1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
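//
// As a hedged illustration (example code, not taken from this pass), widening
// by a factor of 4 conceptually turns
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// into a loop whose index steps by the vector width, with the remaining
// n % 4 iterations handled by a scalar epilogue loop:
//
//   for (int i = 0; i + 4 <= n; i += 4) {
//     // One 'wide' iteration: a single SIMD add covers lanes i..i+3.
//     a[i + 0] = b[i + 0] + 42;
//     a[i + 1] = b[i + 1] + 42;
//     a[i + 2] = b[i + 2] + 42;
//     a[i + 3] = b[i + 3] + 42;
//   }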
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/SizeOpts.h"
141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142 #include <algorithm>
143 #include <cassert>
144 #include <cstdint>
145 #include <cstdlib>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
// Indicates that an epilogue is undesired; predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185     cl::desc("Indicate that an epilogue is undesired, predication should be "
186              "used instead."));
187 
188 static cl::opt<bool> MaximizeBandwidth(
189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190     cl::desc("Maximize bandwidth when selecting vectorization factor which "
191              "will be determined by the smallest type in loop."));
192 
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));
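
// As a hedged illustration (example code, not from this file): the two
// stride-2 accesses below form an interleave group of factor 2, which can be
// vectorized with one wide load per group plus shuffles instead of gathers;
// if the loop body were conditional, the group would also need masking.
//
//   for (int i = 0; i < n; ++i)
//     sum += A[2 * i] + A[2 * i + 1];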
202 
203 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
204     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
206              "below this number"));
207 
208 static cl::opt<unsigned> ForceTargetNumScalarRegs(
209     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
210     cl::desc("A flag that overrides the target's number of scalar registers."));
211 
212 static cl::opt<unsigned> ForceTargetNumVectorRegs(
213     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
214     cl::desc("A flag that overrides the target's number of vector registers."));
215 
216 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
217     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
218     cl::desc("A flag that overrides the target's max interleave factor for "
219              "scalar loops."));
220 
221 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
222     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
223     cl::desc("A flag that overrides the target's max interleave factor for "
224              "vectorized loops."));
225 
226 static cl::opt<unsigned> ForceTargetInstructionCost(
227     "force-target-instruction-cost", cl::init(0), cl::Hidden,
228     cl::desc("A flag that overrides the target's expected cost for "
229              "an instruction to a single constant value. Mostly "
230              "useful for getting consistent testing."));
231 
232 static cl::opt<unsigned> SmallLoopCost(
233     "small-loop-cost", cl::init(20), cl::Hidden,
234     cl::desc(
235         "The cost of a loop that is considered 'small' by the interleaver."));
236 
237 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
238     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
239     cl::desc("Enable the use of the block frequency analysis to access PGO "
240              "heuristics minimizing code growth in cold regions and being more "
241              "aggressive in hot regions."));
242 
243 // Runtime interleave loops for load/store throughput.
244 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
245     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
246     cl::desc(
247         "Enable runtime interleaving until load/store ports are saturated"));
248 
249 /// The number of stores in a loop that are allowed to need predication.
250 static cl::opt<unsigned> NumberOfStoresToPredicate(
251     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
252     cl::desc("Max number of stores to be predicated behind an if."));
253 
254 static cl::opt<bool> EnableIndVarRegisterHeur(
255     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
256     cl::desc("Count the induction variable only once when interleaving"));
257 
258 static cl::opt<bool> EnableCondStoresVectorization(
259     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
260     cl::desc("Enable if predication of stores during vectorization."));
261 
262 static cl::opt<unsigned> MaxNestedScalarReductionIC(
263     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
264     cl::desc("The maximum interleave count to use when interleaving a scalar "
265              "reduction in a nested loop."));
266 
267 cl::opt<bool> EnableVPlanNativePath(
268     "enable-vplan-native-path", cl::init(false), cl::Hidden,
269     cl::desc("Enable VPlan-native vectorization path with "
270              "support for outer loop vectorization."));
271 
272 // FIXME: Remove this switch once we have divergence analysis. Currently we
273 // assume divergent non-backedge branches when this switch is true.
274 cl::opt<bool> EnableVPlanPredication(
275     "enable-vplan-predication", cl::init(false), cl::Hidden,
276     cl::desc("Enable VPlan-native vectorization path predicator with "
277              "support for outer loop vectorization."));
278 
279 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
281 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
282 // verification of the H-CFGs built.
283 static cl::opt<bool> VPlanBuildStressTest(
284     "vplan-build-stress-test", cl::init(false), cl::Hidden,
285     cl::desc(
286         "Build VPlan for every supported loop nest in the function and bail "
287         "out right after the build (stress test the VPlan H-CFG construction "
288         "in the VPlan-native vectorization path)."));
289 
290 cl::opt<bool> llvm::EnableLoopInterleaving(
291     "interleave-loops", cl::init(true), cl::Hidden,
292     cl::desc("Enable loop interleaving in Loop vectorization passes"));
293 cl::opt<bool> llvm::EnableLoopVectorization(
294     "vectorize-loops", cl::init(true), cl::Hidden,
295     cl::desc("Run the Loop vectorization passes"));
296 
297 /// A helper function for converting Scalar types to vector types.
298 /// If the incoming type is void, we return void. If the VF is 1, we return
299 /// the scalar type.
300 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
301   if (Scalar->isVoidTy() || VF == 1)
302     return Scalar;
303   return VectorType::get(Scalar, VF);
304 }
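
// Illustrative usage of the helper above (assuming an LLVMContext &Ctx):
//   ToVectorTy(Type::getInt32Ty(Ctx), 4) yields the vector type <4 x i32>,
//   while ToVectorTy(Type::getInt32Ty(Ctx), 1) returns the scalar i32 as is.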
305 
306 /// A helper function that returns the type of loaded or stored value.
307 static Type *getMemInstValueType(Value *I) {
308   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
309          "Expected Load or Store instruction");
310   if (auto *LI = dyn_cast<LoadInst>(I))
311     return LI->getType();
312   return cast<StoreInst>(I)->getValueOperand()->getType();
313 }
314 
315 /// A helper function that returns true if the given type is irregular. The
316 /// type is irregular if its allocated size doesn't equal the store size of an
317 /// element of the corresponding vector type at the given vectorization factor.
318 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
319   // Determine if an array of VF elements of type Ty is "bitcast compatible"
320   // with a <VF x Ty> vector.
321   if (VF > 1) {
322     auto *VectorTy = VectorType::get(Ty, VF);
323     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
324   }
325 
326   // If the vectorization factor is one, we just check if an array of type Ty
327   // requires padding between elements.
328   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
329 }
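
// For example (an illustrative sketch, assuming a typical data layout): i1
// has an alloc size of 1 byte, so an array of 4 x i1 occupies 4 bytes, while
// the store size of <4 x i1> is a single byte. hasIrregularType(i1, DL, 4) is
// therefore true, and such values cannot simply be bitcast to a vector.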
330 
331 /// A helper function that returns the reciprocal of the block probability of
332 /// predicated blocks. If we return X, we are assuming the predicated block
333 /// will execute once for every X iterations of the loop header.
334 ///
335 /// TODO: We should use actual block probability here, if available. Currently,
336 ///       we always assume predicated blocks have a 50% chance of executing.
337 static unsigned getReciprocalPredBlockProb() { return 2; }
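
// A minimal sketch of how a caller might use this value (BlockCost and
// ScaledCost are hypothetical names, not part of this file):
//   unsigned ScaledCost = BlockCost / getReciprocalPredBlockProb();
//   // i.e. the predicated block is assumed to run on ~50% of iterations.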
338 
339 /// A helper function that adds a 'fast' flag to floating-point operations.
340 static Value *addFastMathFlag(Value *V) {
341   if (isa<FPMathOperator>(V))
342     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
343   return V;
344 }
345 
346 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
347   if (isa<FPMathOperator>(V))
348     cast<Instruction>(V)->setFastMathFlags(FMF);
349   return V;
350 }
351 
352 /// A helper function that returns an integer or floating-point constant with
353 /// value C.
354 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
355   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
356                            : ConstantFP::get(Ty, C);
357 }
358 
359 /// Returns "best known" trip count for the specified loop \p L as defined by
360 /// the following procedure:
361 ///   1) Returns exact trip count if it is known.
362 ///   2) Returns expected trip count according to profile data if any.
363 ///   3) Returns upper bound estimate if it is known.
364 ///   4) Returns None if all of the above failed.
365 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
366   // Check if exact trip count is known.
367   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
368     return ExpectedTC;
369 
370   // Check if there is an expected trip count available from profile data.
371   if (LoopVectorizeWithBlockFrequency)
372     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
373       return EstimatedTC;
374 
375   // Check if upper bound estimate is known.
376   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
377     return ExpectedTC;
378 
379   return None;
380 }
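
// A hedged usage sketch (illustrative only; SE, L and the comparison against
// the tiny-trip-count threshold are assumptions for the example):
//   if (Optional<unsigned> ExpectedTC = getSmallBestKnownTC(*SE, L))
//     if (*ExpectedTC < TinyTripCountVectorThreshold)
//       ; // Treat this as a low trip-count loop when weighing vectorization.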
381 
382 namespace llvm {
383 
384 /// InnerLoopVectorizer vectorizes loops which contain only one basic
385 /// block to a specified vectorization factor (VF).
386 /// This class performs the widening of scalars into vectors, or multiple
387 /// scalars. This class also implements the following features:
388 /// * It inserts an epilogue loop for handling loops that don't have iteration
389 ///   counts that are known to be a multiple of the vectorization factor.
390 /// * It handles the code generation for reduction variables.
391 /// * Scalarization (implementation using scalars) of un-vectorizable
392 ///   instructions.
393 /// InnerLoopVectorizer does not perform any vectorization-legality
394 /// checks, and relies on the caller to check for the different legality
395 /// aspects. The InnerLoopVectorizer relies on the
396 /// LoopVectorizationLegality class to provide information about the induction
397 /// and reduction variables that were found to a given vectorization factor.
398 class InnerLoopVectorizer {
399 public:
400   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
401                       LoopInfo *LI, DominatorTree *DT,
402                       const TargetLibraryInfo *TLI,
403                       const TargetTransformInfo *TTI, AssumptionCache *AC,
404                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
405                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
406                       LoopVectorizationCostModel *CM)
407       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
408         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
409         Builder(PSE.getSE()->getContext()),
410         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
411   virtual ~InnerLoopVectorizer() = default;
412 
413   /// Create a new empty loop. Unlink the old loop and connect the new one.
414   /// Return the pre-header block of the new loop.
415   BasicBlock *createVectorizedLoopSkeleton();
416 
417   /// Widen a single instruction within the innermost loop.
418   void widenInstruction(Instruction &I);
419 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
421   void fixVectorizedLoop();
422 
423   // Return true if any runtime check is added.
424   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
425 
426   /// A type for vectorized values in the new loop. Each value from the
427   /// original loop, when vectorized, is represented by UF vector values in the
428   /// new unrolled loop, where UF is the unroll factor.
429   using VectorParts = SmallVector<Value *, 2>;
430 
431   /// Vectorize a single GetElementPtrInst based on information gathered and
432   /// decisions taken during planning.
433   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
434                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
435 
436   /// Vectorize a single PHINode in a block. This method handles the induction
437   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
438   /// arbitrary length vectors.
439   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
440 
441   /// A helper function to scalarize a single Instruction in the innermost loop.
442   /// Generates a sequence of scalar instances for each lane between \p MinLane
443   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
445   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
446                             bool IfPredicateInstr);
447 
448   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
449   /// is provided, the integer induction variable will first be truncated to
450   /// the corresponding type.
451   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
452 
453   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
454   /// vector or scalar value on-demand if one is not yet available. When
455   /// vectorizing a loop, we visit the definition of an instruction before its
456   /// uses. When visiting the definition, we either vectorize or scalarize the
457   /// instruction, creating an entry for it in the corresponding map. (In some
458   /// cases, such as induction variables, we will create both vector and scalar
459   /// entries.) Then, as we encounter uses of the definition, we derive values
460   /// for each scalar or vector use unless such a value is already available.
461   /// For example, if we scalarize a definition and one of its uses is vector,
462   /// we build the required vector on-demand with an insertelement sequence
463   /// when visiting the use. Otherwise, if the use is scalar, we can use the
464   /// existing scalar definition.
465   ///
466   /// Return a value in the new loop corresponding to \p V from the original
467   /// loop at unroll index \p Part. If the value has already been vectorized,
468   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
469   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
470   /// a new vector value on-demand by inserting the scalar values into a vector
471   /// with an insertelement sequence. If the value has been neither vectorized
472   /// nor scalarized, it must be loop invariant, so we simply broadcast the
473   /// value into a vector.
474   Value *getOrCreateVectorValue(Value *V, unsigned Part);
475 
476   /// Return a value in the new loop corresponding to \p V from the original
477   /// loop at unroll and vector indices \p Instance. If the value has been
478   /// vectorized but not scalarized, the necessary extractelement instruction
479   /// will be generated.
480   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
481 
482   /// Construct the vector value of a scalarized value \p V one lane at a time.
483   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
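
  // A minimal sketch of the packing idea behind packScalarIntoVectorValue
  // (illustrative; ScalarLane is a hypothetical array of the scalar lanes,
  // which the real implementation reads from VectorLoopValueMap):
  //   Value *Vec = UndefValue::get(VectorType::get(V->getType(), VF));
  //   for (unsigned Lane = 0; Lane < VF; ++Lane)
  //     Vec = Builder.CreateInsertElement(Vec, ScalarLane[Lane],
  //                                       Builder.getInt32(Lane));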
484 
485   /// Try to vectorize the interleaved access group that \p Instr belongs to,
486   /// optionally masking the vector operations if \p BlockInMask is non-null.
487   void vectorizeInterleaveGroup(Instruction *Instr,
488                                 VectorParts *BlockInMask = nullptr);
489 
490   /// Vectorize Load and Store instructions, optionally masking the vector
491   /// operations if \p BlockInMask is non-null.
492   void vectorizeMemoryInstruction(Instruction *Instr,
493                                   VectorParts *BlockInMask = nullptr);
494 
495   /// Set the debug location in the builder using the debug location in
496   /// the instruction.
497   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
498 
499   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
501 
502 protected:
503   friend class LoopVectorizationPlanner;
504 
505   /// A small list of PHINodes.
506   using PhiVector = SmallVector<PHINode *, 4>;
507 
508   /// A type for scalarized values in the new loop. Each value from the
509   /// original loop, when scalarized, is represented by UF x VF scalar values
510   /// in the new unrolled loop, where UF is the unroll factor and VF is the
511   /// vectorization factor.
512   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
513 
514   /// Set up the values of the IVs correctly when exiting the vector loop.
515   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
516                     Value *CountRoundDown, Value *EndValue,
517                     BasicBlock *MiddleBlock);
518 
519   /// Create a new induction variable inside L.
520   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
521                                    Value *Step, Instruction *DL);
522 
523   /// Handle all cross-iteration phis in the header.
524   void fixCrossIterationPHIs();
525 
526   /// Fix a first-order recurrence. This is the second phase of vectorizing
527   /// this phi node.
528   void fixFirstOrderRecurrence(PHINode *Phi);
529 
530   /// Fix a reduction cross-iteration phi. This is the second phase of
531   /// vectorizing this phi node.
532   void fixReduction(PHINode *Phi);
533 
  /// The loop exit block may have single-value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
538   void fixLCSSAPHIs();
539 
540   /// Iteratively sink the scalarized operands of a predicated instruction into
541   /// the block that was created for it.
542   void sinkScalarOperands(Instruction *PredInst);
543 
544   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
545   /// represented as.
546   void truncateToMinimalBitwidths();
547 
548   /// Insert the new loop to the loop hierarchy and pass manager
549   /// and update the analysis passes.
550   void updateAnalysis();
551 
552   /// Create a broadcast instruction. This method generates a broadcast
553   /// instruction (shuffle) for loop invariant values and for the induction
554   /// value. If this is the induction variable then we extend it to N, N+1, ...
555   /// this is needed because each iteration in the loop corresponds to a SIMD
556   /// element.
557   virtual Value *getBroadcastInstrs(Value *V);
558 
559   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
562   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
563                                Instruction::BinaryOps Opcode =
564                                Instruction::BinaryOpsEnd);
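
  // For example (illustrative): with VF = 4, a splat Val of an induction
  // value N, StartIdx = 0 and Step = 1, getStepVector produces the vector
  // <N, N+1, N+2, N+3>, one element per SIMD lane of the wide iteration.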
565 
566   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
567   /// variable on which to base the steps, \p Step is the size of the step, and
568   /// \p EntryVal is the value from the original loop that maps to the steps.
569   /// Note that \p EntryVal doesn't have to be an induction variable - it
570   /// can also be a truncate instruction.
571   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
572                         const InductionDescriptor &ID);
573 
574   /// Create a vector induction phi node based on an existing scalar one. \p
575   /// EntryVal is the value from the original loop that maps to the vector phi
576   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
577   /// truncate instruction, instead of widening the original IV, we widen a
578   /// version of the IV truncated to \p EntryVal's type.
579   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
580                                        Value *Step, Instruction *EntryVal);
581 
582   /// Returns true if an instruction \p I should be scalarized instead of
583   /// vectorized for the chosen vectorization factor.
584   bool shouldScalarizeInstruction(Instruction *I) const;
585 
586   /// Returns true if we should generate a scalar version of \p IV.
587   bool needsScalarInduction(Instruction *IV) const;
588 
589   /// If there is a cast involved in the induction variable \p ID, which should
590   /// be ignored in the vectorized loop body, this function records the
591   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
592   /// cast. We had already proved that the casted Phi is equal to the uncasted
593   /// Phi in the vectorized loop (under a runtime guard), and therefore
594   /// there is no need to vectorize the cast - the same value can be used in the
595   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
597   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
598   ///
599   /// \p EntryVal is the value from the original loop that maps to the vector
600   /// phi node and is used to distinguish what is the IV currently being
601   /// processed - original one (if \p EntryVal is a phi corresponding to the
602   /// original IV) or the "newly-created" one based on the proof mentioned above
603   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
604   /// latter case \p EntryVal is a TruncInst and we must not record anything for
605   /// that IV, but it's error-prone to expect callers of this routine to care
606   /// about that, hence this explicit parameter.
607   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
608                                              const Instruction *EntryVal,
609                                              Value *VectorLoopValue,
610                                              unsigned Part,
611                                              unsigned Lane = UINT_MAX);
612 
613   /// Generate a shuffle sequence that will reverse the vector Vec.
614   virtual Value *reverseVector(Value *Vec);
615 
616   /// Returns (and creates if needed) the original loop trip count.
617   Value *getOrCreateTripCount(Loop *NewLoop);
618 
619   /// Returns (and creates if needed) the trip count of the widened loop.
620   Value *getOrCreateVectorTripCount(Loop *NewLoop);
621 
622   /// Returns a bitcasted value to the requested vector type.
623   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
624   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
625                                 const DataLayout &DL);
626 
627   /// Emit a bypass check to see if the vector trip count is zero, including if
628   /// it overflows.
629   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
630 
631   /// Emit a bypass check to see if all of the SCEV assumptions we've
632   /// had to make are correct.
633   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
634 
635   /// Emit bypass checks to check any memory assumptions we may have made.
636   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
637 
638   /// Compute the transformed value of Index at offset StartValue using step
639   /// StepValue.
640   /// For integer induction, returns StartValue + Index * StepValue.
641   /// For pointer induction, returns StartValue[Index * StepValue].
642   /// FIXME: The newly created binary instructions should contain nsw/nuw
643   /// flags, which can be found from the original scalar operations.
644   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
645                               const DataLayout &DL,
646                               const InductionDescriptor &ID) const;
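
  // A worked example for the function above (illustrative): for an integer
  // induction with StartValue = 10 and StepValue = 3, an Index of 4 is
  // transformed to 10 + 4 * 3 = 22.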
647 
648   /// Add additional metadata to \p To that was not present on \p Orig.
649   ///
650   /// Currently this is used to add the noalias annotations based on the
651   /// inserted memchecks.  Use this for instructions that are *cloned* into the
652   /// vector loop.
653   void addNewMetadata(Instruction *To, const Instruction *Orig);
654 
655   /// Add metadata from one instruction to another.
656   ///
657   /// This includes both the original MDs from \p From and additional ones (\see
658   /// addNewMetadata).  Use this for *newly created* instructions in the vector
659   /// loop.
660   void addMetadata(Instruction *To, Instruction *From);
661 
662   /// Similar to the previous function but it adds the metadata to a
663   /// vector of instructions.
664   void addMetadata(ArrayRef<Value *> To, Instruction *From);
665 
666   /// The original loop.
667   Loop *OrigLoop;
668 
669   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
670   /// dynamic knowledge to simplify SCEV expressions and converts them to a
671   /// more usable form.
672   PredicatedScalarEvolution &PSE;
673 
674   /// Loop Info.
675   LoopInfo *LI;
676 
677   /// Dominator Tree.
678   DominatorTree *DT;
679 
680   /// Alias Analysis.
681   AliasAnalysis *AA;
682 
683   /// Target Library Info.
684   const TargetLibraryInfo *TLI;
685 
686   /// Target Transform Info.
687   const TargetTransformInfo *TTI;
688 
689   /// Assumption Cache.
690   AssumptionCache *AC;
691 
692   /// Interface to emit optimization remarks.
693   OptimizationRemarkEmitter *ORE;
694 
695   /// LoopVersioning.  It's only set up (non-null) if memchecks were
696   /// used.
697   ///
698   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
700   std::unique_ptr<LoopVersioning> LVer;
701 
702   /// The vectorization SIMD factor to use. Each vector will have this many
703   /// vector elements.
704   unsigned VF;
705 
706   /// The vectorization unroll factor to use. Each scalar is vectorized to this
707   /// many different vector instructions.
708   unsigned UF;
709 
710   /// The builder that we use
711   IRBuilder<> Builder;
712 
713   // --- Vectorization state ---
714 
715   /// The vector-loop preheader.
716   BasicBlock *LoopVectorPreHeader;
717 
718   /// The scalar-loop preheader.
719   BasicBlock *LoopScalarPreHeader;
720 
721   /// Middle Block between the vector and the scalar.
722   BasicBlock *LoopMiddleBlock;
723 
724   /// The ExitBlock of the scalar loop.
725   BasicBlock *LoopExitBlock;
726 
727   /// The vector loop body.
728   BasicBlock *LoopVectorBody;
729 
730   /// The scalar loop body.
731   BasicBlock *LoopScalarBody;
732 
733   /// A list of all bypass blocks. The first block is the entry of the loop.
734   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
735 
736   /// The new Induction variable which was added to the new block.
737   PHINode *Induction = nullptr;
738 
739   /// The induction variable of the old basic block.
740   PHINode *OldInduction = nullptr;
741 
742   /// Maps values from the original loop to their corresponding values in the
743   /// vectorized loop. A key value can map to either vector values, scalar
744   /// values or both kinds of values, depending on whether the key was
745   /// vectorized and scalarized.
746   VectorizerValueMap VectorLoopValueMap;
747 
748   /// Store instructions that were predicated.
749   SmallVector<Instruction *, 4> PredicatedInstructions;
750 
751   /// Trip count of the original loop.
752   Value *TripCount = nullptr;
753 
754   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
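  /// e.g. (illustrative): TripCount = 100, VF = 4, UF = 2 gives
  /// 100 - (100 % 8) = 96.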
755   Value *VectorTripCount = nullptr;
756 
757   /// The legality analysis.
758   LoopVectorizationLegality *Legal;
759 
  /// The profitability analysis.
761   LoopVectorizationCostModel *Cost;
762 
763   // Record whether runtime checks are added.
764   bool AddedSafetyChecks = false;
765 
766   // Holds the end values for each induction variable. We save the end values
767   // so we can later fix-up the external users of the induction variables.
768   DenseMap<PHINode *, Value *> IVEndValues;
769 
770   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
771   // fixed up at the end of vector code generation.
772   SmallVector<PHINode *, 8> OrigPHIsToFix;
773 };
774 
775 class InnerLoopUnroller : public InnerLoopVectorizer {
776 public:
777   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
778                     LoopInfo *LI, DominatorTree *DT,
779                     const TargetLibraryInfo *TLI,
780                     const TargetTransformInfo *TTI, AssumptionCache *AC,
781                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
782                     LoopVectorizationLegality *LVL,
783                     LoopVectorizationCostModel *CM)
784       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
785                             UnrollFactor, LVL, CM) {}
786 
787 private:
788   Value *getBroadcastInstrs(Value *V) override;
789   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
790                        Instruction::BinaryOps Opcode =
791                        Instruction::BinaryOpsEnd) override;
792   Value *reverseVector(Value *Vec) override;
793 };
794 
795 } // end namespace llvm
796 
/// Look for a meaningful debug location on the instruction or its
798 /// operands.
799 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
800   if (!I)
801     return I;
802 
803   DebugLoc Empty;
804   if (I->getDebugLoc() != Empty)
805     return I;
806 
807   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
808     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
809       if (OpInst->getDebugLoc() != Empty)
810         return OpInst;
811   }
812 
813   return I;
814 }
815 
816 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
817   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
818     const DILocation *DIL = Inst->getDebugLoc();
819     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
820         !isa<DbgInfoIntrinsic>(Inst)) {
821       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
822       if (NewDIL)
823         B.SetCurrentDebugLocation(NewDIL.getValue());
824       else
825         LLVM_DEBUG(dbgs()
826                    << "Failed to create new discriminator: "
827                    << DIL->getFilename() << " Line: " << DIL->getLine());
828     }
829     else
830       B.SetCurrentDebugLocation(DIL);
831   } else
832     B.SetCurrentDebugLocation(DebugLoc());
833 }
834 
835 /// Write a record \p DebugMsg about vectorization failure to the debug
836 /// output stream. If \p I is passed, it is an instruction that prevents
837 /// vectorization.
838 #ifndef NDEBUG
839 static void debugVectorizationFailure(const StringRef DebugMsg,
840     Instruction *I) {
841   dbgs() << "LV: Not vectorizing: " << DebugMsg;
842   if (I != nullptr)
843     dbgs() << " " << *I;
844   else
845     dbgs() << '.';
846   dbgs() << '\n';
847 }
848 #endif
849 
850 /// Create an analysis remark that explains why vectorization failed
851 ///
852 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
853 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
854 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
855 /// the location of the remark.  \return the remark object that can be
856 /// streamed to.
857 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
858     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
859   Value *CodeRegion = TheLoop->getHeader();
860   DebugLoc DL = TheLoop->getStartLoc();
861 
862   if (I) {
863     CodeRegion = I->getParent();
864     // If there is no debug location attached to the instruction, revert back to
865     // using the loop's.
866     if (I->getDebugLoc())
867       DL = I->getDebugLoc();
868   }
869 
870   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
871   R << "loop not vectorized: ";
872   return R;
873 }
874 
875 namespace llvm {
876 
877 void reportVectorizationFailure(const StringRef DebugMsg,
878     const StringRef OREMsg, const StringRef ORETag,
879     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
880   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
881   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
882   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
883                 ORETag, TheLoop, I) << OREMsg);
884 }
885 
886 } // end namespace llvm
887 
888 #ifndef NDEBUG
889 /// \return string containing a file name and a line # for the given loop.
890 static std::string getDebugLocString(const Loop *L) {
891   std::string Result;
892   if (L) {
893     raw_string_ostream OS(Result);
894     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
895       LoopDbgLoc.print(OS);
896     else
897       // Just print the module name.
898       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
899     OS.flush();
900   }
901   return Result;
902 }
903 #endif
904 
905 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
906                                          const Instruction *Orig) {
907   // If the loop was versioned with memchecks, add the corresponding no-alias
908   // metadata.
909   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
910     LVer->annotateInstWithNoAlias(To, Orig);
911 }
912 
913 void InnerLoopVectorizer::addMetadata(Instruction *To,
914                                       Instruction *From) {
915   propagateMetadata(To, From);
916   addNewMetadata(To, From);
917 }
918 
919 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
920                                       Instruction *From) {
921   for (Value *V : To) {
922     if (Instruction *I = dyn_cast<Instruction>(V))
923       addMetadata(I, From);
924   }
925 }
926 
927 namespace llvm {
928 
// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
931 enum ScalarEpilogueLowering {
932 
933   // The default: allowing scalar epilogues.
934   CM_ScalarEpilogueAllowed,
935 
936   // Vectorization with OptForSize: don't allow epilogues.
937   CM_ScalarEpilogueNotAllowedOptSize,
938 
  // A special case of vectorization with OptForSize: loops with a very small
940   // trip count are considered for vectorization under OptForSize, thereby
941   // making sure the cost of their loop body is dominant, free of runtime
942   // guards and scalar iteration overheads.
943   CM_ScalarEpilogueNotAllowedLowTripLoop,
944 
945   // Loop hint predicate indicating an epilogue is undesired.
946   CM_ScalarEpilogueNotNeededUsePredicate
947 };
948 
949 /// LoopVectorizationCostModel - estimates the expected speedups due to
950 /// vectorization.
951 /// In many cases vectorization is not profitable. This can happen because of
952 /// a number of reasons. In this class we mainly attempt to predict the
953 /// expected speedup/slowdowns due to the supported instruction set. We use the
954 /// TargetTransformInfo to query the different backends for the cost of
955 /// different operations.
956 class LoopVectorizationCostModel {
957 public:
958   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
959                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
960                              LoopVectorizationLegality *Legal,
961                              const TargetTransformInfo &TTI,
962                              const TargetLibraryInfo *TLI, DemandedBits *DB,
963                              AssumptionCache *AC,
964                              OptimizationRemarkEmitter *ORE, const Function *F,
965                              const LoopVectorizeHints *Hints,
966                              InterleavedAccessInfo &IAI)
967       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
968         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
969         Hints(Hints), InterleaveInfo(IAI) {}
970 
971   /// \return An upper bound for the vectorization factor, or None if
972   /// vectorization and interleaving should be avoided up front.
973   Optional<unsigned> computeMaxVF();
974 
975   /// \return True if runtime checks are required for vectorization, and false
976   /// otherwise.
977   bool runtimeChecksRequired();
978 
979   /// \return The most profitable vectorization factor and the cost of that VF.
980   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
981   /// then this vectorization factor will be selected if vectorization is
982   /// possible.
983   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
984 
985   /// Setup cost-based decisions for user vectorization factor.
986   void selectUserVectorizationFactor(unsigned UserVF) {
987     collectUniformsAndScalars(UserVF);
988     collectInstsToScalarize(UserVF);
989   }
990 
991   /// \return The size (in bits) of the smallest and widest types in the code
992   /// that needs to be vectorized. We ignore values that remain scalar such as
993   /// 64 bit loop indices.
994   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
995 
996   /// \return The desired interleave count.
997   /// If interleave count has been specified by metadata it will be returned.
998   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
999   /// are the selected vectorization factor and the cost of the selected VF.
1000   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1001 
  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on its cost. This function
  /// makes cost-based decisions for Load/Store instructions and collects them
  /// in a map. This decision map is used for building the lists of
  /// loop-uniform and loop-scalar instructions. The calculated cost is saved
  /// with the widening decision in order to avoid redundant calculations.
1009   void setCostBasedWideningDecision(unsigned VF);
1010 
1011   /// A struct that represents some properties of the register usage
1012   /// of a loop.
1013   struct RegisterUsage {
1014     /// Holds the number of loop invariant values that are used in the loop.
1015     /// The key is ClassID of target-provided register class.
1016     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1017     /// Holds the maximum number of concurrent live intervals in the loop.
1018     /// The key is ClassID of target-provided register class.
1019     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1020   };
1021 
1022   /// \return Returns information about the register usages of the loop for the
1023   /// given vectorization factors.
1024   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1025 
1026   /// Collect values we want to ignore in the cost model.
1027   void collectValuesToIgnore();
1028 
1029   /// \returns The smallest bitwidth each instruction can be represented with.
1030   /// The vector equivalents of these instructions should be truncated to this
1031   /// type.
1032   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1033     return MinBWs;
1034   }
1035 
1036   /// \returns True if it is more profitable to scalarize instruction \p I for
1037   /// vectorization factor \p VF.
1038   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1039     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1040 
1041     // Cost model is not run in the VPlan-native path - return conservative
1042     // result until this changes.
1043     if (EnableVPlanNativePath)
1044       return false;
1045 
1046     auto Scalars = InstsToScalarize.find(VF);
1047     assert(Scalars != InstsToScalarize.end() &&
1048            "VF not yet analyzed for scalarization profitability");
1049     return Scalars->second.find(I) != Scalars->second.end();
1050   }
1051 
1052   /// Returns true if \p I is known to be uniform after vectorization.
1053   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1054     if (VF == 1)
1055       return true;
1056 
1057     // Cost model is not run in the VPlan-native path - return conservative
1058     // result until this changes.
1059     if (EnableVPlanNativePath)
1060       return false;
1061 
1062     auto UniformsPerVF = Uniforms.find(VF);
1063     assert(UniformsPerVF != Uniforms.end() &&
1064            "VF not yet analyzed for uniformity");
1065     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1066   }
1067 
1068   /// Returns true if \p I is known to be scalar after vectorization.
1069   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1070     if (VF == 1)
1071       return true;
1072 
1073     // Cost model is not run in the VPlan-native path - return conservative
1074     // result until this changes.
1075     if (EnableVPlanNativePath)
1076       return false;
1077 
1078     auto ScalarsPerVF = Scalars.find(VF);
1079     assert(ScalarsPerVF != Scalars.end() &&
1080            "Scalar values are not calculated for VF");
1081     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1082   }
1083 
1084   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1085   /// for vectorization factor \p VF.
1086   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1087     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1088            !isProfitableToScalarize(I, VF) &&
1089            !isScalarAfterVectorization(I, VF);
1090   }
1091 
1092   /// Decision that was taken during cost calculation for memory instruction.
1093   enum InstWidening {
1094     CM_Unknown,
1095     CM_Widen,         // For consecutive accesses with stride +1.
1096     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1097     CM_Interleave,
1098     CM_GatherScatter,
1099     CM_Scalarize
1100   };
1101 
1102   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1103   /// instruction \p I and vector width \p VF.
1104   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1105                            unsigned Cost) {
1106     assert(VF >= 2 && "Expected VF >=2");
1107     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1108   }
1109 
1110   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1111   /// interleaving group \p Grp and vector width \p VF.
1112   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1113                            InstWidening W, unsigned Cost) {
1114     assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1117     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1118       if (auto *I = Grp->getMember(i)) {
1119         if (Grp->getInsertPos() == I)
1120           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1121         else
1122           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1123       }
1124     }
1125   }
1126 
1127   /// Return the cost model decision for the given instruction \p I and vector
1128   /// width \p VF. Return CM_Unknown if this instruction did not pass
1129   /// through the cost modeling.
1130   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1131     assert(VF >= 2 && "Expected VF >=2");
1132 
1133     // Cost model is not run in the VPlan-native path - return conservative
1134     // result until this changes.
1135     if (EnableVPlanNativePath)
1136       return CM_GatherScatter;
1137 
1138     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1139     auto Itr = WideningDecisions.find(InstOnVF);
1140     if (Itr == WideningDecisions.end())
1141       return CM_Unknown;
1142     return Itr->second.first;
1143   }
1144 
1145   /// Return the vectorization cost for the given instruction \p I and vector
1146   /// width \p VF.
1147   unsigned getWideningCost(Instruction *I, unsigned VF) {
1148     assert(VF >= 2 && "Expected VF >=2");
1149     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1150     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1151            "The cost is not calculated");
1152     return WideningDecisions[InstOnVF].second;
1153   }
1154 
1155   /// Return True if instruction \p I is an optimizable truncate whose operand
1156   /// is an induction variable. Such a truncate will be removed by adding a new
1157   /// induction variable with the destination type.
1158   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1159     // If the instruction is not a truncate, return false.
1160     auto *Trunc = dyn_cast<TruncInst>(I);
1161     if (!Trunc)
1162       return false;
1163 
1164     // Get the source and destination types of the truncate.
1165     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1166     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1167 
1168     // If the truncate is free for the given types, return false. Replacing a
1169     // free truncate with an induction variable would add an induction variable
1170     // update instruction to each iteration of the loop. We exclude from this
1171     // check the primary induction variable since it will need an update
1172     // instruction regardless.
1173     Value *Op = Trunc->getOperand(0);
1174     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1175       return false;
1176 
1177     // If the truncated value is not an induction variable, return false.
1178     return Legal->isInductionPhi(Op);
1179   }
1180 
1181   /// Collects the instructions to scalarize for each predicated instruction in
1182   /// the loop.
1183   void collectInstsToScalarize(unsigned VF);
1184 
1185   /// Collect Uniform and Scalar values for the given \p VF.
1186   /// The sets depend on CM decision for Load/Store instructions
1187   /// that may be vectorized as interleave, gather-scatter or scalarized.
1188   void collectUniformsAndScalars(unsigned VF) {
1189     // Do the analysis once.
1190     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1191       return;
1192     setCostBasedWideningDecision(VF);
1193     collectLoopUniforms(VF);
1194     collectLoopScalars(VF);
1195   }
1196 
1197   /// Returns true if the target machine supports masked store operation
1198   /// for the given \p DataType and kind of access to \p Ptr.
1199   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1200     return Legal->isConsecutivePtr(Ptr) &&
1201            TTI.isLegalMaskedStore(DataType, Alignment);
1202   }
1203 
1204   /// Returns true if the target machine supports masked load operation
1205   /// for the given \p DataType and kind of access to \p Ptr.
1206   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1207     return Legal->isConsecutivePtr(Ptr) &&
1208            TTI.isLegalMaskedLoad(DataType, Alignment);
1209   }
1210 
1211   /// Returns true if the target machine supports masked scatter operation
1212   /// for the given \p DataType.
1213   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1214     return TTI.isLegalMaskedScatter(DataType, Alignment);
1215   }
1216 
1217   /// Returns true if the target machine supports masked gather operation
1218   /// for the given \p DataType.
1219   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1220     return TTI.isLegalMaskedGather(DataType, Alignment);
1221   }
1222 
1223   /// Returns true if the target machine can represent \p V as a masked gather
1224   /// or scatter operation.
1225   bool isLegalGatherOrScatter(Value *V) {
1226     bool LI = isa<LoadInst>(V);
1227     bool SI = isa<StoreInst>(V);
1228     if (!LI && !SI)
1229       return false;
1230     auto *Ty = getMemInstValueType(V);
1231     MaybeAlign Align = getLoadStoreAlignment(V);
1232     return (LI && isLegalMaskedGather(Ty, Align)) ||
1233            (SI && isLegalMaskedScatter(Ty, Align));
1234   }
1235 
1236   /// Returns true if \p I is an instruction that will be scalarized with
1237   /// predication. Such instructions include conditional stores and
1238   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1241   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1242 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1246   bool isPredicatedInst(Instruction *I) {
1247     if (!blockNeedsPredication(I->getParent()))
1248       return false;
1249     // Loads and stores that need some form of masked operation are predicated
1250     // instructions.
1251     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1252       return Legal->isMaskRequired(I);
1253     return isScalarWithPredication(I);
1254   }
1255 
1256   /// Returns true if \p I is a memory instruction with consecutive memory
1257   /// access that can be widened.
1258   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1259 
1260   /// Returns true if \p I is a memory instruction in an interleaved-group
1261   /// of memory accesses that can be vectorized with wide vector loads/stores
1262   /// and shuffles.
1263   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1264 
1265   /// Check if \p Instr belongs to any interleaved access group.
1266   bool isAccessInterleaved(Instruction *Instr) {
1267     return InterleaveInfo.isInterleaved(Instr);
1268   }
1269 
1270   /// Get the interleaved access group that \p Instr belongs to.
1271   const InterleaveGroup<Instruction> *
1272   getInterleavedAccessGroup(Instruction *Instr) {
1273     return InterleaveInfo.getInterleaveGroup(Instr);
1274   }
1275 
1276   /// Returns true if an interleaved group requires a scalar iteration
1277   /// to handle accesses with gaps, and there is nothing preventing us from
1278   /// creating a scalar epilogue.
1279   bool requiresScalarEpilogue() const {
1280     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1281   }
1282 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1285   bool isScalarEpilogueAllowed() const {
1286     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1287   }
1288 
1289   /// Returns true if all loop blocks should be masked to fold tail loop.
1290   bool foldTailByMasking() const { return FoldTailByMasking; }
1291 
1292   bool blockNeedsPredication(BasicBlock *BB) {
1293     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1294   }
1295 
1296   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1297   /// with factor VF.  Return the cost of the instruction, including
1298   /// scalarization overhead if it's needed.
1299   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1300 
1301   /// Estimate cost of a call instruction CI if it were vectorized with factor
1302   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either the vector version isn't available or it is too
  /// expensive.
1306   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1307 
1308 private:
1309   unsigned NumPredStores = 0;
1310 
1311   /// \return An upper bound for the vectorization factor, larger than zero.
1312   /// One is returned if vectorization should best be avoided due to cost.
1313   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1314 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1322   using VectorizationCostTy = std::pair<unsigned, bool>;
1323 
1324   /// Returns the expected execution cost. The unit of the cost does
1325   /// not matter because we use the 'cost' units to compare different
1326   /// vector widths. The cost that is returned is *not* normalized by
1327   /// the factor width.
1328   VectorizationCostTy expectedCost(unsigned VF);
1329 
1330   /// Returns the execution time cost of an instruction for a given vector
1331   /// width. Vector width of one means scalar.
1332   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1333 
1334   /// The cost-computation logic from getInstructionCost which provides
1335   /// the vector type as an output parameter.
1336   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1337 
1338   /// Calculate vectorization cost of memory instruction \p I.
1339   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1340 
1341   /// The cost computation for scalarized memory instruction.
1342   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1343 
1344   /// The cost computation for interleaving group of memory instructions.
1345   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1346 
1347   /// The cost computation for Gather/Scatter instruction.
1348   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1349 
1350   /// The cost computation for widening instruction \p I with consecutive
1351   /// memory access.
1352   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1353 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored ? 0 : extract of last
  /// element).
1358   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1359 
1360   /// Estimate the overhead of scalarizing an instruction. This is a
1361   /// convenience wrapper for the type-based getScalarizationOverhead API.
1362   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1363 
  /// Returns whether the instruction is a load or store and will be emitted
1365   /// as a vector operation.
1366   bool isConsecutiveLoadOrStore(Instruction *I);
1367 
1368   /// Returns true if an artificially high cost for emulated masked memrefs
1369   /// should be used.
1370   bool useEmulatedMaskMemRefHack(Instruction *I);
1371 
1372   /// Map of scalar integer values to the smallest bitwidth they can be legally
1373   /// represented as. The vector equivalents of these values should be truncated
1374   /// to this type.
1375   MapVector<Instruction *, uint64_t> MinBWs;
1376 
1377   /// A type representing the costs for instructions if they were to be
1378   /// scalarized rather than vectorized. The entries are Instruction-Cost
1379   /// pairs.
1380   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1381 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1384   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1385 
1386   /// Records whether it is allowed to have the original scalar loop execute at
1387   /// least once. This may be needed as a fallback loop in case runtime
1388   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not divisible by the VF,
1390   /// or as a peel-loop to handle gaps in interleave-groups.
1391   /// Under optsize and when the trip count is very small we don't allow any
1392   /// iterations to execute in the scalar loop.
1393   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1394 
1395   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1396   bool FoldTailByMasking = false;
1397 
1398   /// A map holding scalar costs for different vectorization factors. The
1399   /// presence of a cost for an instruction in the mapping indicates that the
1400   /// instruction will be scalarized when vectorizing with the associated
1401   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1402   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1403 
1404   /// Holds the instructions known to be uniform after vectorization.
1405   /// The data is collected per VF.
1406   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1407 
1408   /// Holds the instructions known to be scalar after vectorization.
1409   /// The data is collected per VF.
1410   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1411 
1412   /// Holds the instructions (address computations) that are forced to be
1413   /// scalarized.
1414   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1415 
1416   /// Returns the expected difference in cost from scalarizing the expression
1417   /// feeding a predicated instruction \p PredInst. The instructions to
1418   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1419   /// non-negative return value implies the expression will be scalarized.
1420   /// Currently, only single-use chains are considered for scalarization.
1421   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1422                               unsigned VF);
1423 
1424   /// Collect the instructions that are uniform after vectorization. An
1425   /// instruction is uniform if we represent it with a single scalar value in
1426   /// the vectorized loop corresponding to each vector iteration. Examples of
1427   /// uniform instructions include pointer operands of consecutive or
1428   /// interleaved memory accesses. Note that although uniformity implies an
1429   /// instruction will be scalar, the reverse is not true. In general, a
1430   /// scalarized instruction will be represented by VF scalar values in the
1431   /// vectorized loop, each corresponding to an iteration of the original
1432   /// scalar loop.
1433   void collectLoopUniforms(unsigned VF);
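
  // For illustration only: in
  //   for (i = 0; i < n; ++i)
  //     A[i] = B[i] + 1;
  // the pointer operands of the consecutive load and store (the addresses of
  // B[i] and A[i]) are uniform: one scalar address per vector iteration is
  // enough, since a single wide memory operation covers lanes i .. i+VF-1.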
1434 
1435   /// Collect the instructions that are scalar after vectorization. An
1436   /// instruction is scalar if it is known to be uniform or will be scalarized
1437   /// during vectorization. Non-uniform scalarized instructions will be
1438   /// represented by VF values in the vectorized loop, each corresponding to an
1439   /// iteration of the original scalar loop.
1440   void collectLoopScalars(unsigned VF);
1441 
1442   /// Keeps cost model vectorization decision and cost for instructions.
1443   /// Right now it is used for memory instructions only.
1444   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1445                                 std::pair<InstWidening, unsigned>>;
1446 
1447   DecisionList WideningDecisions;
1448 
1449   /// Returns true if \p V is expected to be vectorized and it needs to be
1450   /// extracted.
1451   bool needsExtract(Value *V, unsigned VF) const {
1452     Instruction *I = dyn_cast<Instruction>(V);
1453     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1454       return false;
1455 
1456     // Assume we can vectorize V (and hence we need extraction) if the
1457     // scalars are not computed yet. This can happen, because it is called
1458     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1459     // the scalars are collected. That should be a safe assumption in most
1460     // cases, because we check if the operands have vectorizable types
1461     // beforehand in LoopVectorizationLegality.
1462     return Scalars.find(VF) == Scalars.end() ||
1463            !isScalarAfterVectorization(I, VF);
1464   };
1465 
1466   /// Returns a range containing only operands needing to be extracted.
1467   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1468                                                    unsigned VF) {
1469     return SmallVector<Value *, 4>(make_filter_range(
1470         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1471   }
1472 
1473 public:
1474   /// The loop that we evaluate.
1475   Loop *TheLoop;
1476 
1477   /// Predicated scalar evolution analysis.
1478   PredicatedScalarEvolution &PSE;
1479 
1480   /// Loop Info analysis.
1481   LoopInfo *LI;
1482 
1483   /// Vectorization legality.
1484   LoopVectorizationLegality *Legal;
1485 
1486   /// Vector target information.
1487   const TargetTransformInfo &TTI;
1488 
1489   /// Target Library Info.
1490   const TargetLibraryInfo *TLI;
1491 
1492   /// Demanded bits analysis.
1493   DemandedBits *DB;
1494 
1495   /// Assumption cache.
1496   AssumptionCache *AC;
1497 
1498   /// Interface to emit optimization remarks.
1499   OptimizationRemarkEmitter *ORE;
1500 
1501   const Function *TheFunction;
1502 
1503   /// Loop Vectorize Hint.
1504   const LoopVectorizeHints *Hints;
1505 
1506   /// The interleave access information contains groups of interleaved accesses
1507   /// with the same stride and close to each other.
1508   InterleavedAccessInfo &InterleaveInfo;
1509 
1510   /// Values to ignore in the cost model.
1511   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1512 
1513   /// Values to ignore in the cost model when VF > 1.
1514   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1515 };
1516 
1517 } // end namespace llvm
1518 
1519 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1526 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1527 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1528 // provides *explicit vectorization hints* (LV can bypass legal checks and
1529 // assume that vectorization is legal). However, both hints are implemented
1530 // using the same metadata (llvm.loop.vectorize, processed by
1531 // LoopVectorizeHints). This will be fixed in the future when the native IR
1532 // representation for pragma 'omp simd' is introduced.
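//
// For illustration only, an outer loop annotated as required above (the array
// names and bounds are placeholders):
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < n; ++i)      // outer loop, vectorized explicitly
//     for (int j = 0; j < m; ++j)
//       A[i][j] += B[i][j];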
1533 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1534                                    OptimizationRemarkEmitter *ORE) {
1535   assert(!OuterLp->empty() && "This is not an outer loop");
1536   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1537 
1538   // Only outer loops with an explicit vectorization hint are supported.
1539   // Unannotated outer loops are ignored.
1540   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1541     return false;
1542 
1543   Function *Fn = OuterLp->getHeader()->getParent();
1544   if (!Hints.allowVectorization(Fn, OuterLp,
1545                                 true /*VectorizeOnlyWhenForced*/)) {
1546     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1547     return false;
1548   }
1549 
1550   if (Hints.getInterleave() > 1) {
1551     // TODO: Interleave support is future work.
1552     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1553                          "outer loops.\n");
1554     Hints.emitRemarkWithHints();
1555     return false;
1556   }
1557 
1558   return true;
1559 }
1560 
1561 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1562                                   OptimizationRemarkEmitter *ORE,
1563                                   SmallVectorImpl<Loop *> &V) {
1564   // Collect inner loops and outer loops without irreducible control flow. For
1565   // now, only collect outer loops that have explicit vectorization hints. If we
1566   // are stress testing the VPlan H-CFG construction, we collect the outermost
1567   // loop of every loop nest.
1568   if (L.empty() || VPlanBuildStressTest ||
1569       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1570     LoopBlocksRPO RPOT(&L);
1571     RPOT.perform(LI);
1572     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1573       V.push_back(&L);
1574       // TODO: Collect inner loops inside marked outer loops in case
1575       // vectorization fails for the outer loop. Do not invoke
1576       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1577       // already known to be reducible. We can use an inherited attribute for
1578       // that.
1579       return;
1580     }
1581   }
1582   for (Loop *InnerL : L)
1583     collectSupportedLoops(*InnerL, LI, ORE, V);
1584 }
1585 
1586 namespace {
1587 
1588 /// The LoopVectorize Pass.
1589 struct LoopVectorize : public FunctionPass {
1590   /// Pass identification, replacement for typeid
1591   static char ID;
1592 
1593   LoopVectorizePass Impl;
1594 
1595   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1596                          bool VectorizeOnlyWhenForced = false)
1597       : FunctionPass(ID) {
1598     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1599     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1600     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1601   }
1602 
1603   bool runOnFunction(Function &F) override {
1604     if (skipFunction(F))
1605       return false;
1606 
1607     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1608     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1609     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1610     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1611     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1612     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1613     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1614     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1615     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1616     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1617     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1618     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1619     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1620 
1621     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1622         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1623 
1624     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1625                         GetLAA, *ORE, PSI);
1626   }
1627 
1628   void getAnalysisUsage(AnalysisUsage &AU) const override {
1629     AU.addRequired<AssumptionCacheTracker>();
1630     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1631     AU.addRequired<DominatorTreeWrapperPass>();
1632     AU.addRequired<LoopInfoWrapperPass>();
1633     AU.addRequired<ScalarEvolutionWrapperPass>();
1634     AU.addRequired<TargetTransformInfoWrapperPass>();
1635     AU.addRequired<AAResultsWrapperPass>();
1636     AU.addRequired<LoopAccessLegacyAnalysis>();
1637     AU.addRequired<DemandedBitsWrapperPass>();
1638     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1639 
1640     // We currently do not preserve loopinfo/dominator analyses with outer loop
1641     // vectorization. Until this is addressed, mark these analyses as preserved
1642     // only for non-VPlan-native path.
1643     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1644     if (!EnableVPlanNativePath) {
1645       AU.addPreserved<LoopInfoWrapperPass>();
1646       AU.addPreserved<DominatorTreeWrapperPass>();
1647     }
1648 
1649     AU.addPreserved<BasicAAWrapperPass>();
1650     AU.addPreserved<GlobalsAAWrapperPass>();
1651     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1652   }
1653 };
1654 
1655 } // end anonymous namespace
1656 
1657 //===----------------------------------------------------------------------===//
1658 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1659 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1660 //===----------------------------------------------------------------------===//
1661 
1662 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1663   // We need to place the broadcast of invariant variables outside the loop,
1664   // but only if it's proven safe to do so. Else, broadcast will be inside
1665   // vector loop body.
1666   Instruction *Instr = dyn_cast<Instruction>(V);
1667   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1668                      (!Instr ||
1669                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1670   // Place the code for broadcasting invariant variables in the new preheader.
1671   IRBuilder<>::InsertPointGuard Guard(Builder);
1672   if (SafeToHoist)
1673     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1674 
1675   // Broadcast the scalar into all locations in the vector.
1676   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1677 
1678   return Shuf;
1679 }
1680 
1681 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1682     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1683   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1684          "Expected either an induction phi-node or a truncate of it!");
1685   Value *Start = II.getStartValue();
1686 
1687   // Construct the initial value of the vector IV in the vector loop preheader
1688   auto CurrIP = Builder.saveIP();
1689   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1690   if (isa<TruncInst>(EntryVal)) {
1691     assert(Start->getType()->isIntegerTy() &&
1692            "Truncation requires an integer type");
1693     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1694     Step = Builder.CreateTrunc(Step, TruncType);
1695     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1696   }
1697   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1698   Value *SteppedStart =
1699       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1700 
1701   // We create vector phi nodes for both integer and floating-point induction
1702   // variables. Here, we determine the kind of arithmetic we will perform.
1703   Instruction::BinaryOps AddOp;
1704   Instruction::BinaryOps MulOp;
1705   if (Step->getType()->isIntegerTy()) {
1706     AddOp = Instruction::Add;
1707     MulOp = Instruction::Mul;
1708   } else {
1709     AddOp = II.getInductionOpcode();
1710     MulOp = Instruction::FMul;
1711   }
1712 
1713   // Multiply the vectorization factor by the step using integer or
1714   // floating-point arithmetic as appropriate.
1715   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1716   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1717 
1718   // Create a vector splat to use in the induction update.
1719   //
1720   // FIXME: If the step is non-constant, we create the vector splat with
1721   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1722   //        handle a constant vector splat.
1723   Value *SplatVF = isa<Constant>(Mul)
1724                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1725                        : Builder.CreateVectorSplat(VF, Mul);
1726   Builder.restoreIP(CurrIP);
1727 
1728   // We may need to add the step a number of times, depending on the unroll
1729   // factor. The last of those goes into the PHI.
1730   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1731                                     &*LoopVectorBody->getFirstInsertionPt());
1732   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1733   Instruction *LastInduction = VecInd;
1734   for (unsigned Part = 0; Part < UF; ++Part) {
1735     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1736 
1737     if (isa<TruncInst>(EntryVal))
1738       addMetadata(LastInduction, EntryVal);
1739     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1740 
1741     LastInduction = cast<Instruction>(addFastMathFlag(
1742         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1743     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1744   }
1745 
1746   // Move the last step to the end of the latch block. This ensures consistent
1747   // placement of all induction updates.
1748   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1749   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1750   auto *ICmp = cast<Instruction>(Br->getCondition());
1751   LastInduction->moveBefore(ICmp);
1752   LastInduction->setName("vec.ind.next");
1753 
1754   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1755   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1756 }
1757 
1758 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1759   return Cost->isScalarAfterVectorization(I, VF) ||
1760          Cost->isProfitableToScalarize(I, VF);
1761 }
1762 
1763 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1764   if (shouldScalarizeInstruction(IV))
1765     return true;
1766   auto isScalarInst = [&](User *U) -> bool {
1767     auto *I = cast<Instruction>(U);
1768     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1769   };
1770   return llvm::any_of(IV->users(), isScalarInst);
1771 }
1772 
1773 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1774     const InductionDescriptor &ID, const Instruction *EntryVal,
1775     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1776   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1777          "Expected either an induction phi-node or a truncate of it!");
1778 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // reuses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1785   if (isa<TruncInst>(EntryVal))
1786     return;
1787 
1788   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1789   if (Casts.empty())
1790     return;
1791   // Only the first Cast instruction in the Casts vector is of interest.
1792   // The rest of the Casts (if exist) have no uses outside the
1793   // induction update chain itself.
1794   Instruction *CastInst = *Casts.begin();
1795   if (Lane < UINT_MAX)
1796     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1797   else
1798     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1799 }
1800 
1801 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1802   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1803          "Primary induction variable must have an integer type");
1804 
1805   auto II = Legal->getInductionVars()->find(IV);
1806   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1807 
1808   auto ID = II->second;
1809   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1810 
1811   // The scalar value to broadcast. This will be derived from the canonical
1812   // induction variable.
1813   Value *ScalarIV = nullptr;
1814 
1815   // The value from the original loop to which we are mapping the new induction
1816   // variable.
1817   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1818 
1819   // True if we have vectorized the induction variable.
1820   auto VectorizedIV = false;
1821 
1822   // Determine if we want a scalar version of the induction variable. This is
1823   // true if the induction variable itself is not widened, or if it has at
1824   // least one user in the loop that is not widened.
1825   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1826 
1827   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1829   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1830          "Induction step should be loop invariant");
1831   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1832   Value *Step = nullptr;
1833   if (PSE.getSE()->isSCEVable(IV->getType())) {
1834     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1835     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1836                              LoopVectorPreHeader->getTerminator());
1837   } else {
1838     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1839   }
1840 
1841   // Try to create a new independent vector induction variable. If we can't
1842   // create the phi node, we will splat the scalar induction variable in each
1843   // loop iteration.
1844   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1845     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1846     VectorizedIV = true;
1847   }
1848 
1849   // If we haven't yet vectorized the induction variable, or if we will create
1850   // a scalar one, we need to define the scalar induction variable and step
1851   // values. If we were given a truncation type, truncate the canonical
1852   // induction variable and step. Otherwise, derive these values from the
1853   // induction descriptor.
1854   if (!VectorizedIV || NeedsScalarIV) {
1855     ScalarIV = Induction;
1856     if (IV != OldInduction) {
1857       ScalarIV = IV->getType()->isIntegerTy()
1858                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1859                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1860                                           IV->getType());
1861       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1862       ScalarIV->setName("offset.idx");
1863     }
1864     if (Trunc) {
1865       auto *TruncType = cast<IntegerType>(Trunc->getType());
1866       assert(Step->getType()->isIntegerTy() &&
1867              "Truncation requires an integer step");
1868       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1869       Step = Builder.CreateTrunc(Step, TruncType);
1870     }
1871   }
1872 
1873   // If we haven't yet vectorized the induction variable, splat the scalar
1874   // induction variable, and build the necessary step vectors.
1875   // TODO: Don't do it unless the vectorized IV is really required.
1876   if (!VectorizedIV) {
1877     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1878     for (unsigned Part = 0; Part < UF; ++Part) {
1879       Value *EntryPart =
1880           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1881       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1882       if (Trunc)
1883         addMetadata(EntryPart, Trunc);
1884       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1885     }
1886   }
1887 
1888   // If an induction variable is only used for counting loop iterations or
1889   // calculating addresses, it doesn't need to be widened. Create scalar steps
1890   // that can be used by instructions we will later scalarize. Note that the
1891   // addition of the scalar steps will not increase the number of instructions
1892   // in the loop in the common case prior to InstCombine. We will be trading
1893   // one vector extract for each scalar step.
1894   if (NeedsScalarIV)
1895     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1896 }
1897 
1898 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1899                                           Instruction::BinaryOps BinOp) {
1900   // Create and check the types.
1901   assert(Val->getType()->isVectorTy() && "Must be a vector");
1902   int VLen = Val->getType()->getVectorNumElements();
1903 
1904   Type *STy = Val->getType()->getScalarType();
1905   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1906          "Induction Step must be an integer or FP");
1907   assert(Step->getType() == STy && "Step has wrong type");
1908 
1909   SmallVector<Constant *, 8> Indices;
1910 
1911   if (STy->isIntegerTy()) {
1912     // Create a vector of consecutive numbers from zero to VF.
1913     for (int i = 0; i < VLen; ++i)
1914       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1915 
1916     // Add the consecutive indices to the vector value.
1917     Constant *Cv = ConstantVector::get(Indices);
1918     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1919     Step = Builder.CreateVectorSplat(VLen, Step);
1920     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
1923     Step = Builder.CreateMul(Cv, Step);
1924     return Builder.CreateAdd(Val, Step, "induction");
1925   }
1926 
1927   // Floating point induction.
1928   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1929          "Binary Opcode should be specified for FP induction");
1930   // Create a vector of consecutive numbers from zero to VF.
1931   for (int i = 0; i < VLen; ++i)
1932     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1933 
1934   // Add the consecutive indices to the vector value.
1935   Constant *Cv = ConstantVector::get(Indices);
1936 
1937   Step = Builder.CreateVectorSplat(VLen, Step);
1938 
1939   // Floating point operations had to be 'fast' to enable the induction.
1940   FastMathFlags Flags;
1941   Flags.setFast();
1942 
1943   Value *MulOp = Builder.CreateFMul(Cv, Step);
1944   if (isa<Instruction>(MulOp))
1945     // Have to check, MulOp may be a constant
1946     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1947 
1948   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1949   if (isa<Instruction>(BOp))
1950     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1951   return BOp;
1952 }
1953 
1954 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1955                                            Instruction *EntryVal,
1956                                            const InductionDescriptor &ID) {
1957   // We shouldn't have to build scalar steps if we aren't vectorizing.
1958   assert(VF > 1 && "VF should be greater than one");
1959 
1960   // Get the value type and ensure it and the step have the same integer type.
1961   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1962   assert(ScalarIVTy == Step->getType() &&
1963          "Val and Step should have the same type");
1964 
1965   // We build scalar steps for both integer and floating-point induction
1966   // variables. Here, we determine the kind of arithmetic we will perform.
1967   Instruction::BinaryOps AddOp;
1968   Instruction::BinaryOps MulOp;
1969   if (ScalarIVTy->isIntegerTy()) {
1970     AddOp = Instruction::Add;
1971     MulOp = Instruction::Mul;
1972   } else {
1973     AddOp = ID.getInductionOpcode();
1974     MulOp = Instruction::FMul;
1975   }
1976 
1977   // Determine the number of scalars we need to generate for each unroll
1978   // iteration. If EntryVal is uniform, we only need to generate the first
1979   // lane. Otherwise, we generate all VF values.
1980   unsigned Lanes =
1981       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1982                                                                          : VF;
1983   // Compute the scalar steps and save the results in VectorLoopValueMap.
1984   for (unsigned Part = 0; Part < UF; ++Part) {
1985     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1986       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1987       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1988       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1989       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1990       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1991     }
1992   }
1993 }
1994 
1995 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1996   assert(V != Induction && "The new induction variable should not be used.");
1997   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1998   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1999 
2000   // If we have a stride that is replaced by one, do it here. Defer this for
2001   // the VPlan-native path until we start running Legal checks in that path.
2002   if (!EnableVPlanNativePath && Legal->hasStride(V))
2003     V = ConstantInt::get(V->getType(), 1);
2004 
2005   // If we have a vector mapped to this value, return it.
2006   if (VectorLoopValueMap.hasVectorValue(V, Part))
2007     return VectorLoopValueMap.getVectorValue(V, Part);
2008 
2009   // If the value has not been vectorized, check if it has been scalarized
2010   // instead. If it has been scalarized, and we actually need the value in
2011   // vector form, we will construct the vector values on demand.
2012   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2013     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2014 
2015     // If we've scalarized a value, that value should be an instruction.
2016     auto *I = cast<Instruction>(V);
2017 
2018     // If we aren't vectorizing, we can just copy the scalar map values over to
2019     // the vector map.
2020     if (VF == 1) {
2021       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2022       return ScalarValue;
2023     }
2024 
2025     // Get the last scalar instruction we generated for V and Part. If the value
2026     // is known to be uniform after vectorization, this corresponds to lane zero
2027     // of the Part unroll iteration. Otherwise, the last instruction is the one
2028     // we created for the last vector lane of the Part unroll iteration.
2029     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2030     auto *LastInst = cast<Instruction>(
2031         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2032 
2033     // Set the insert point after the last scalarized instruction. This ensures
2034     // the insertelement sequence will directly follow the scalar definitions.
2035     auto OldIP = Builder.saveIP();
2036     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2037     Builder.SetInsertPoint(&*NewIP);
2038 
2039     // However, if we are vectorizing, we need to construct the vector values.
2040     // If the value is known to be uniform after vectorization, we can just
2041     // broadcast the scalar value corresponding to lane zero for each unroll
2042     // iteration. Otherwise, we construct the vector values using insertelement
2043     // instructions. Since the resulting vectors are stored in
2044     // VectorLoopValueMap, we will only generate the insertelements once.
2045     Value *VectorValue = nullptr;
2046     if (Cost->isUniformAfterVectorization(I, VF)) {
2047       VectorValue = getBroadcastInstrs(ScalarValue);
2048       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2049     } else {
2050       // Initialize packing with insertelements to start from undef.
2051       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2052       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2053       for (unsigned Lane = 0; Lane < VF; ++Lane)
2054         packScalarIntoVectorValue(V, {Part, Lane});
2055       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2056     }
2057     Builder.restoreIP(OldIP);
2058     return VectorValue;
2059   }
2060 
2061   // If this scalar is unknown, assume that it is a constant or that it is
2062   // loop invariant. Broadcast V and save the value for future uses.
2063   Value *B = getBroadcastInstrs(V);
2064   VectorLoopValueMap.setVectorValue(V, Part, B);
2065   return B;
2066 }
2067 
2068 Value *
2069 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2070                                             const VPIteration &Instance) {
2071   // If the value is not an instruction contained in the loop, it should
2072   // already be scalar.
2073   if (OrigLoop->isLoopInvariant(V))
2074     return V;
2075 
2076   assert(Instance.Lane > 0
2077              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2078              : true && "Uniform values only have lane zero");
2079 
2080   // If the value from the original loop has not been vectorized, it is
2081   // represented by UF x VF scalar values in the new loop. Return the requested
2082   // scalar value.
2083   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2084     return VectorLoopValueMap.getScalarValue(V, Instance);
2085 
2086   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2087   // for the given unroll part. If this entry is not a vector type (i.e., the
2088   // vectorization factor is one), there is no need to generate an
2089   // extractelement instruction.
2090   auto *U = getOrCreateVectorValue(V, Instance.Part);
2091   if (!U->getType()->isVectorTy()) {
2092     assert(VF == 1 && "Value not scalarized has non-vector type");
2093     return U;
2094   }
2095 
2096   // Otherwise, the value from the original loop has been vectorized and is
2097   // represented by UF vector values. Extract and return the requested scalar
2098   // value from the appropriate vector lane.
2099   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2100 }
2101 
2102 void InnerLoopVectorizer::packScalarIntoVectorValue(
2103     Value *V, const VPIteration &Instance) {
2104   assert(V != Induction && "The new induction variable should not be used.");
2105   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2106   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2107 
2108   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2109   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2110   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2111                                             Builder.getInt32(Instance.Lane));
2112   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2113 }
2114 
2115 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2116   assert(Vec->getType()->isVectorTy() && "Invalid type");
2117   SmallVector<Constant *, 8> ShuffleMask;
2118   for (unsigned i = 0; i < VF; ++i)
2119     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2120 
2121   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2122                                      ConstantVector::get(ShuffleMask),
2123                                      "reverse");
2124 }
2125 
2126 // Return whether we allow using masked interleave-groups (for dealing with
2127 // strided loads/stores that reside in predicated blocks, or for dealing
2128 // with gaps).
2129 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2130   // If an override option has been passed in for interleaved accesses, use it.
2131   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2132     return EnableMaskedInterleavedMemAccesses;
2133 
2134   return TTI.enableMaskedInterleavedAccessVectorization();
2135 }
2136 
2137 // Try to vectorize the interleave group that \p Instr belongs to.
2138 //
// E.g. Translate the following interleaved load group (factor = 3):
2140 //   for (i = 0; i < N; i+=3) {
2141 //     R = Pic[i];             // Member of index 0
2142 //     G = Pic[i+1];           // Member of index 1
2143 //     B = Pic[i+2];           // Member of index 2
2144 //     ... // do something to R, G, B
2145 //   }
2146 // To:
2147 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2148 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2149 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2150 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2151 //
// Or translate the following interleaved store group (factor = 3):
2153 //   for (i = 0; i < N; i+=3) {
2154 //     ... do something to R, G, B
2155 //     Pic[i]   = R;           // Member of index 0
2156 //     Pic[i+1] = G;           // Member of index 1
2157 //     Pic[i+2] = B;           // Member of index 2
2158 //   }
2159 // To:
2160 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2161 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2162 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2163 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2164 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2165 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2166                                                    VectorParts *BlockInMask) {
2167   const InterleaveGroup<Instruction> *Group =
2168       Cost->getInterleavedAccessGroup(Instr);
2169   assert(Group && "Fail to get an interleaved access group.");
2170 
2171   // Skip if current instruction is not the insert position.
2172   if (Instr != Group->getInsertPos())
2173     return;
2174 
2175   const DataLayout &DL = Instr->getModule()->getDataLayout();
2176   Value *Ptr = getLoadStorePointerOperand(Instr);
2177 
2178   // Prepare for the vector type of the interleaved load/store.
2179   Type *ScalarTy = getMemInstValueType(Instr);
2180   unsigned InterleaveFactor = Group->getFactor();
2181   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2182   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2183 
2184   // Prepare for the new pointers.
2185   setDebugLocFromInst(Builder, Ptr);
2186   SmallVector<Value *, 2> NewPtrs;
2187   unsigned Index = Group->getIndex(Instr);
2188 
2189   VectorParts Mask;
2190   bool IsMaskForCondRequired = BlockInMask;
2191   if (IsMaskForCondRequired) {
2192     Mask = *BlockInMask;
2193     // TODO: extend the masked interleaved-group support to reversed access.
2194     assert(!Group->isReverse() && "Reversed masked interleave-group "
2195                                   "not supported.");
2196   }
2197 
2198   // If the group is reverse, adjust the index to refer to the last vector lane
2199   // instead of the first. We adjust the index from the first vector lane,
2200   // rather than directly getting the pointer for lane VF - 1, because the
2201   // pointer operand of the interleaved access is supposed to be uniform. For
2202   // uniform instructions, we're only required to generate a value for the
2203   // first vector lane in each unroll iteration.
2204   if (Group->isReverse())
2205     Index += (VF - 1) * Group->getFactor();
2206 
2207   bool InBounds = false;
2208   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2209     InBounds = gep->isInBounds();
2210 
2211   for (unsigned Part = 0; Part < UF; Part++) {
2212     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2213 
    // Note that the current instruction could be a member at any index; the
    // address needs to be adjusted to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2225     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2226     if (InBounds)
2227       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2228 
2229     // Cast to the vector pointer type.
2230     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2231   }
2232 
2233   setDebugLocFromInst(Builder, Instr);
2234   Value *UndefVec = UndefValue::get(VecTy);
2235 
2236   Value *MaskForGaps = nullptr;
2237   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2238     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2239     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2240   }
2241 
2242   // Vectorize the interleaved load group.
2243   if (isa<LoadInst>(Instr)) {
2244     // For each unroll part, create a wide load for the group.
2245     SmallVector<Value *, 2> NewLoads;
2246     for (unsigned Part = 0; Part < UF; Part++) {
2247       Instruction *NewLoad;
2248       if (IsMaskForCondRequired || MaskForGaps) {
2249         assert(useMaskedInterleavedAccesses(*TTI) &&
2250                "masked interleaved groups are not allowed.");
2251         Value *GroupMask = MaskForGaps;
2252         if (IsMaskForCondRequired) {
2253           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2254           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2255           Value *ShuffledMask = Builder.CreateShuffleVector(
2256               Mask[Part], Undefs, RepMask, "interleaved.mask");
2257           GroupMask = MaskForGaps
2258                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2259                                                 MaskForGaps)
2260                           : ShuffledMask;
2261         }
2262         NewLoad =
2263             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2264                                      GroupMask, UndefVec, "wide.masked.vec");
2265       }
2266       else
2267         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2268                                             Group->getAlignment(), "wide.vec");
2269       Group->addMetadata(NewLoad);
2270       NewLoads.push_back(NewLoad);
2271     }
2272 
2273     // For each member in the group, shuffle out the appropriate data from the
2274     // wide loads.
2275     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2276       Instruction *Member = Group->getMember(I);
2277 
2278       // Skip the gaps in the group.
2279       if (!Member)
2280         continue;
2281 
2282       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2283       for (unsigned Part = 0; Part < UF; Part++) {
2284         Value *StridedVec = Builder.CreateShuffleVector(
2285             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2286 
        // If this member has a different type, cast the result accordingly.
2288         if (Member->getType() != ScalarTy) {
2289           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2290           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2291         }
2292 
2293         if (Group->isReverse())
2294           StridedVec = reverseVector(StridedVec);
2295 
2296         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2297       }
2298     }
2299     return;
2300   }
2301 
  // The sub-vector type for the current instruction.
2303   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2304 
2305   // Vectorize the interleaved store group.
2306   for (unsigned Part = 0; Part < UF; Part++) {
2307     // Collect the stored vector from each member.
2308     SmallVector<Value *, 4> StoredVecs;
2309     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2311       Instruction *Member = Group->getMember(i);
2312       assert(Member && "Fail to get a member from an interleaved store group");
2313 
2314       Value *StoredVec = getOrCreateVectorValue(
2315           cast<StoreInst>(Member)->getValueOperand(), Part);
2316       if (Group->isReverse())
2317         StoredVec = reverseVector(StoredVec);
2318 
      // If this member has a different type, cast it to a unified type.
2321       if (StoredVec->getType() != SubVT)
2322         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2323 
2324       StoredVecs.push_back(StoredVec);
2325     }
2326 
2327     // Concatenate all vectors into a wide vector.
2328     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2329 
2330     // Interleave the elements in the wide vector.
2331     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2332     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2333                                               "interleaved.vec");
2334 
2335     Instruction *NewStoreInstr;
2336     if (IsMaskForCondRequired) {
2337       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2338       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2339       Value *ShuffledMask = Builder.CreateShuffleVector(
2340           Mask[Part], Undefs, RepMask, "interleaved.mask");
2341       NewStoreInstr = Builder.CreateMaskedStore(
2342           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2343     }
2344     else
2345       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2346         Group->getAlignment());
2347 
2348     Group->addMetadata(NewStoreInstr);
2349   }
2350 }
2351 
2352 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2353                                                      VectorParts *BlockInMask) {
2354   // Attempt to issue a wide load.
2355   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2356   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2357 
2358   assert((LI || SI) && "Invalid Load/Store instruction");
2359 
2360   LoopVectorizationCostModel::InstWidening Decision =
2361       Cost->getWideningDecision(Instr, VF);
2362   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2363          "CM decision should be taken at this point");
2364   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2365     return vectorizeInterleaveGroup(Instr);
2366 
2367   Type *ScalarDataTy = getMemInstValueType(Instr);
2368   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2369   Value *Ptr = getLoadStorePointerOperand(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2372   const DataLayout &DL = Instr->getModule()->getDataLayout();
2373   const Align Alignment =
2374       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2375   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2376 
2377   // Determine if the pointer operand of the access is either consecutive or
2378   // reverse consecutive.
2379   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2380   bool ConsecutiveStride =
2381       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2382   bool CreateGatherScatter =
2383       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2384 
2385   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2386   // gather/scatter. Otherwise Decision should have been to Scalarize.
2387   assert((ConsecutiveStride || CreateGatherScatter) &&
2388          "The instruction should be scalarized");
2389 
2390   // Handle consecutive loads/stores.
2391   if (ConsecutiveStride)
2392     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2393 
2394   VectorParts Mask;
2395   bool isMaskRequired = BlockInMask;
2396   if (isMaskRequired)
2397     Mask = *BlockInMask;
2398 
2399   bool InBounds = false;
2400   if (auto *gep = dyn_cast<GetElementPtrInst>(
2401           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2402     InBounds = gep->isInBounds();
2403 
2404   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2405     // Calculate the pointer for the specific unroll-part.
2406     GetElementPtrInst *PartPtr = nullptr;
2407 
2408     if (Reverse) {
2409       // If the address is consecutive but reversed, then the
2410       // wide store needs to start at the last vector element.
2411       PartPtr = cast<GetElementPtrInst>(
2412           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2413       PartPtr->setIsInBounds(InBounds);
2414       PartPtr = cast<GetElementPtrInst>(
2415           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2416       PartPtr->setIsInBounds(InBounds);
2417       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2418         Mask[Part] = reverseVector(Mask[Part]);
2419     } else {
2420       PartPtr = cast<GetElementPtrInst>(
2421           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2422       PartPtr->setIsInBounds(InBounds);
2423     }
2424 
2425     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2426   };
2427 
2428   // Handle Stores:
2429   if (SI) {
2430     setDebugLocFromInst(Builder, SI);
2431 
2432     for (unsigned Part = 0; Part < UF; ++Part) {
2433       Instruction *NewSI = nullptr;
2434       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2435       if (CreateGatherScatter) {
2436         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2437         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2438         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2439                                             Alignment.value(), MaskPart);
2440       } else {
2441         if (Reverse) {
2442           // If we store to reverse consecutive memory locations, then we need
2443           // to reverse the order of elements in the stored value.
2444           StoredVal = reverseVector(StoredVal);
2445           // We don't want to update the value in the map as it might be used in
2446           // another expression. So don't call resetVectorValue(StoredVal).
2447         }
2448         auto *VecPtr = CreateVecPtr(Part, Ptr);
2449         if (isMaskRequired)
2450           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
2451                                             Alignment.value(), Mask[Part]);
2452         else
2453           NewSI =
2454               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2455       }
2456       addMetadata(NewSI, SI);
2457     }
2458     return;
2459   }
2460 
2461   // Handle loads.
2462   assert(LI && "Must have a load instruction");
2463   setDebugLocFromInst(Builder, LI);
2464   for (unsigned Part = 0; Part < UF; ++Part) {
2465     Value *NewLI;
2466     if (CreateGatherScatter) {
2467       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2468       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2469       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2470                                          nullptr, "wide.masked.gather");
2471       addMetadata(NewLI, LI);
2472     } else {
2473       auto *VecPtr = CreateVecPtr(Part, Ptr);
2474       if (isMaskRequired)
2475         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
2476                                          UndefValue::get(DataTy),
2477                                          "wide.masked.load");
2478       else
2479         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2480                                           "wide.load");
2481 
2482       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2483       addMetadata(NewLI, LI);
2484       if (Reverse)
2485         NewLI = reverseVector(NewLI);
2486     }
2487     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2488   }
2489 }
2490 
2491 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2492                                                const VPIteration &Instance,
2493                                                bool IfPredicateInstr) {
2494   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2495 
2496   setDebugLocFromInst(Builder, Instr);
2497 
  // Does this instruction return a value?
2499   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2500 
2501   Instruction *Cloned = Instr->clone();
2502   if (!IsVoidRetTy)
2503     Cloned->setName(Instr->getName() + ".cloned");
2504 
  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
2507   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2508     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2509     Cloned->setOperand(op, NewOp);
2510   }
2511   addNewMetadata(Cloned, Instr);
2512 
2513   // Place the cloned scalar in the new loop.
2514   Builder.Insert(Cloned);
2515 
2516   // Add the cloned scalar to the scalar map entry.
2517   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2518 
  // If we just cloned a new assumption, add it to the assumption cache.
2520   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2521     if (II->getIntrinsicID() == Intrinsic::assume)
2522       AC->registerAssumption(II);
2523 
2524   // End if-block.
2525   if (IfPredicateInstr)
2526     PredicatedInstructions.push_back(Cloned);
2527 }
2528 
2529 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2530                                                       Value *End, Value *Step,
2531                                                       Instruction *DL) {
2532   BasicBlock *Header = L->getHeader();
2533   BasicBlock *Latch = L->getLoopLatch();
2534   // As we're just creating this loop, it's possible no latch exists
2535   // yet. If so, use the header as this will be a single block loop.
2536   if (!Latch)
2537     Latch = Header;
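
  // The generated induction roughly looks like this (illustrative; actual
  // names and types depend on Start, End and Step):
  //   header:
  //     %index = phi i64 [ %start, %preheader ], [ %index.next, %latch ]
  //     ...
  //   latch:
  //     %index.next = add i64 %index, %step
  //     %cmp = icmp eq i64 %index.next, %end
  //     br i1 %cmp, label %exit, label %header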
2538 
2539   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2540   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2541   setDebugLocFromInst(Builder, OldInst);
2542   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2543 
2544   Builder.SetInsertPoint(Latch->getTerminator());
2545   setDebugLocFromInst(Builder, OldInst);
2546 
2547   // Create i+1 and fill the PHINode.
2548   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2549   Induction->addIncoming(Start, L->getLoopPreheader());
2550   Induction->addIncoming(Next, Latch);
2551   // Create the compare.
2552   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2553   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2554 
2555   // Now we have two terminators. Remove the old one from the block.
2556   Latch->getTerminator()->eraseFromParent();
2557 
2558   return Induction;
2559 }
2560 
2561 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2562   if (TripCount)
2563     return TripCount;
2564 
2565   assert(L && "Create Trip Count for null loop.");
2566   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2567   // Find the loop boundaries.
2568   ScalarEvolution *SE = PSE.getSE();
2569   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2570   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2571          "Invalid loop count");
2572 
2573   Type *IdxTy = Legal->getWidestInductionType();
2574   assert(IdxTy && "No type for induction");
2575 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count in that case is if
  // the induction variable was signed and as such will not overflow, so
  // truncation is legal.
2581   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2582       IdxTy->getPrimitiveSizeInBits())
2583     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2584   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2585 
2586   // Get the total trip count from the count by adding 1.
2587   const SCEV *ExitCount = SE->getAddExpr(
2588       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2589 
2590   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2591 
2592   // Expand the trip count and place the new instructions in the preheader.
2593   // Notice that the pre-header does not change, only the loop body.
2594   SCEVExpander Exp(*SE, DL, "induction");
2595 
2596   // Count holds the overall loop count (N).
2597   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2598                                 L->getLoopPreheader()->getTerminator());
2599 
2600   if (TripCount->getType()->isPointerTy())
2601     TripCount =
2602         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2603                                     L->getLoopPreheader()->getTerminator());
2604 
2605   return TripCount;
2606 }
2607 
2608 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2609   if (VectorTripCount)
2610     return VectorTripCount;
2611 
2612   Value *TC = getOrCreateTripCount(L);
2613   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2614 
2615   Type *Ty = TC->getType();
2616   Constant *Step = ConstantInt::get(Ty, VF * UF);
2617 
2618   // If the tail is to be folded by masking, round the number of iterations N
2619   // up to a multiple of Step instead of rounding down. This is done by first
2620   // adding Step-1 and then rounding down. Note that it's ok if this addition
2621   // overflows: the vector induction variable will eventually wrap to zero given
2622   // that it starts at zero and its Step is a power of two; the loop will then
2623   // exit, with the last early-exit vector comparison also producing all-true.
2624   if (Cost->foldTailByMasking()) {
2625     assert(isPowerOf2_32(VF * UF) &&
2626            "VF*UF must be a power of 2 when folding tail by masking");
2627     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2628   }
2629 
2630   // Now we need to generate the expression for the part of the loop that the
2631   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2632   // iterations are not required for correctness, or N - Step, otherwise. Step
2633   // is equal to the vectorization factor (number of SIMD elements) times the
2634   // unroll factor (number of SIMD instructions).
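  // Worked example (illustrative): with VF = 4 and UF = 2, Step is 8. For a
  // trip count of 13, the URem below yields n.mod.vf = 5 and n.vec = 8. If the
  // tail is folded by masking, the count was rounded up to 20 above, giving
  // n.mod.vf = 4 and n.vec = 16, so the masked vector loop covers all 13
  // iterations.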
2635   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2636 
2637   // If there is a non-reversed interleaved group that may speculatively access
2638   // memory out-of-bounds, we need to ensure that there will be at least one
2639   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2640   // the trip count, we set the remainder to be equal to the step. If the step
2641   // does not evenly divide the trip count, no adjustment is necessary since
2642   // there will already be scalar iterations. Note that the minimum iterations
2643   // check ensures that N >= Step.
2644   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2645     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2646     R = Builder.CreateSelect(IsZero, Step, R);
2647   }
2648 
2649   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2650 
2651   return VectorTripCount;
2652 }
2653 
2654 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2655                                                    const DataLayout &DL) {
2656   // Verify that V is a vector type with same number of elements as DstVTy.
2657   unsigned VF = DstVTy->getNumElements();
2658   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2660   Type *SrcElemTy = SrcVecTy->getElementType();
2661   Type *DstElemTy = DstVTy->getElementType();
2662   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2663          "Vector elements must have same size");
2664 
2665   // Do a direct cast if element types are castable.
2666   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2667     return Builder.CreateBitOrPointerCast(V, DstVTy);
2668   }
  // V cannot be directly cast to the desired vector type.
  // This may happen when V is a floating point vector but DstVTy is a vector
  // of pointers, or vice-versa. Handle it with a two-step bitcast through an
  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
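  // For example (illustrative, assuming 64-bit pointers): a <4 x double>
  // source would first be bitcast to <4 x i64> and then cast to the pointer
  // vector type, or vice-versa.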
2673   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2674          "Only one type should be a pointer type");
2675   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2676          "Only one type should be a floating point type");
2677   Type *IntTy =
2678       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2679   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2680   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2681   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2682 }
2683 
2684 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2685                                                          BasicBlock *Bypass) {
2686   Value *Count = getOrCreateTripCount(L);
2687   BasicBlock *BB = L->getLoopPreheader();
2688   IRBuilder<> Builder(BB->getTerminator());
2689 
2690   // Generate code to check if the loop's trip count is less than VF * UF, or
2691   // equal to it in case a scalar epilogue is required; this implies that the
2692   // vector trip count is zero. This check also covers the case where adding one
2693   // to the backedge-taken count overflowed leading to an incorrect trip count
2694   // of zero. In this case we will also jump to the scalar loop.
2695   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2696                                           : ICmpInst::ICMP_ULT;
2697 
  // If the tail is to be folded, the vector loop takes care of all iterations.
2699   Value *CheckMinIters = Builder.getFalse();
2700   if (!Cost->foldTailByMasking())
2701     CheckMinIters = Builder.CreateICmp(
2702         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2703         "min.iters.check");
2704 
2705   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2706   // Update dominator tree immediately if the generated block is a
2707   // LoopBypassBlock because SCEV expansions to generate loop bypass
2708   // checks may query it before the current function is finished.
2709   DT->addNewBlock(NewBB, BB);
2710   if (L->getParentLoop())
2711     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2712   ReplaceInstWithInst(BB->getTerminator(),
2713                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2714   LoopBypassBlocks.push_back(BB);
2715 }
2716 
2717 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2718   BasicBlock *BB = L->getLoopPreheader();
2719 
  // Generate the code to check the SCEV assumptions that we made.
2721   // We want the new basic block to start at the first instruction in a
2722   // sequence of instructions that form a check.
2723   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2724                    "scev.check");
2725   Value *SCEVCheck =
2726       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2727 
2728   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2729     if (C->isZero())
2730       return;
2731 
2732   assert(!BB->getParent()->hasOptSize() &&
2733          "Cannot SCEV check stride or overflow when optimizing for size");
2734 
2735   // Create a new block containing the stride check.
2736   BB->setName("vector.scevcheck");
2737   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2738   // Update dominator tree immediately if the generated block is a
2739   // LoopBypassBlock because SCEV expansions to generate loop bypass
2740   // checks may query it before the current function is finished.
2741   DT->addNewBlock(NewBB, BB);
2742   if (L->getParentLoop())
2743     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2744   ReplaceInstWithInst(BB->getTerminator(),
2745                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2746   LoopBypassBlocks.push_back(BB);
2747   AddedSafetyChecks = true;
2748 }
2749 
2750 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2751   // VPlan-native path does not do any analysis for runtime checks currently.
2752   if (EnableVPlanNativePath)
2753     return;
2754 
2755   BasicBlock *BB = L->getLoopPreheader();
2756 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2760   Instruction *FirstCheckInst;
2761   Instruction *MemRuntimeCheck;
2762   std::tie(FirstCheckInst, MemRuntimeCheck) =
2763       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2764   if (!MemRuntimeCheck)
2765     return;
2766 
2767   if (BB->getParent()->hasOptSize()) {
2768     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2769            "Cannot emit memory checks when optimizing for size, unless forced "
2770            "to vectorize.");
2771     ORE->emit([&]() {
2772       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2773                                         L->getStartLoc(), L->getHeader())
2774              << "Code-size may be reduced by not forcing "
2775                 "vectorization, or by source-code modifications "
2776                 "eliminating the need for runtime checks "
2777                 "(e.g., adding 'restrict').";
2778     });
2779   }
2780 
2781   // Create a new block containing the memory check.
2782   BB->setName("vector.memcheck");
2783   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2784   // Update dominator tree immediately if the generated block is a
2785   // LoopBypassBlock because SCEV expansions to generate loop bypass
2786   // checks may query it before the current function is finished.
2787   DT->addNewBlock(NewBB, BB);
2788   if (L->getParentLoop())
2789     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2790   ReplaceInstWithInst(BB->getTerminator(),
2791                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2792   LoopBypassBlocks.push_back(BB);
2793   AddedSafetyChecks = true;
2794 
2795   // We currently don't use LoopVersioning for the actual loop cloning but we
2796   // still use it to add the noalias metadata.
2797   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2798                                            PSE.getSE());
2799   LVer->prepareNoAliasMetadata();
2800 }
2801 
2802 Value *InnerLoopVectorizer::emitTransformedIndex(
2803     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2804     const InductionDescriptor &ID) const {
2805 
2806   SCEVExpander Exp(*SE, DL, "induction");
2807   auto Step = ID.getStep();
2808   auto StartValue = ID.getStartValue();
2809   assert(Index->getType() == Step->getType() &&
2810          "Index type does not match StepValue type");
2811 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
2818   auto CreateAdd = [&B](Value *X, Value *Y) {
2819     assert(X->getType() == Y->getType() && "Types don't match!");
2820     if (auto *CX = dyn_cast<ConstantInt>(X))
2821       if (CX->isZero())
2822         return Y;
2823     if (auto *CY = dyn_cast<ConstantInt>(Y))
2824       if (CY->isZero())
2825         return X;
2826     return B.CreateAdd(X, Y);
2827   };
2828 
2829   auto CreateMul = [&B](Value *X, Value *Y) {
2830     assert(X->getType() == Y->getType() && "Types don't match!");
2831     if (auto *CX = dyn_cast<ConstantInt>(X))
2832       if (CX->isOne())
2833         return Y;
2834     if (auto *CY = dyn_cast<ConstantInt>(Y))
2835       if (CY->isOne())
2836         return X;
2837     return B.CreateMul(X, Y);
2838   };
2839 
2840   switch (ID.getKind()) {
2841   case InductionDescriptor::IK_IntInduction: {
2842     assert(Index->getType() == StartValue->getType() &&
2843            "Index type does not match StartValue type");
2844     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2845       return B.CreateSub(StartValue, Index);
2846     auto *Offset = CreateMul(
2847         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2848     return CreateAdd(StartValue, Offset);
2849   }
2850   case InductionDescriptor::IK_PtrInduction: {
2851     assert(isa<SCEVConstant>(Step) &&
2852            "Expected constant step for pointer induction");
2853     return B.CreateGEP(
2854         StartValue->getType()->getPointerElementType(), StartValue,
2855         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2856                                            &*B.GetInsertPoint())));
2857   }
2858   case InductionDescriptor::IK_FpInduction: {
2859     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2860     auto InductionBinOp = ID.getInductionBinOp();
2861     assert(InductionBinOp &&
2862            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2863             InductionBinOp->getOpcode() == Instruction::FSub) &&
2864            "Original bin op should be defined for FP induction");
2865 
2866     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2867 
2868     // Floating point operations had to be 'fast' to enable the induction.
2869     FastMathFlags Flags;
2870     Flags.setFast();
2871 
2872     Value *MulExp = B.CreateFMul(StepValue, Index);
2873     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2875       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2876 
2877     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2878                                "induction");
2879     if (isa<Instruction>(BOp))
2880       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2881 
2882     return BOp;
2883   }
2884   case InductionDescriptor::IK_NoInduction:
2885     return nullptr;
2886   }
2887   llvm_unreachable("invalid enum");
2888 }
2889 
2890 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2891   /*
2892    In this function we generate a new loop. The new loop will contain
2893    the vectorized instructions while the old loop will continue to run the
2894    scalar remainder.
2895 
2896        [ ] <-- loop iteration number check.
2897     /   |
2898    /    v
2899   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2900   |  /  |
2901   | /   v
2902   ||   [ ]     <-- vector pre header.
2903   |/    |
2904   |     v
2905   |    [  ] \
2906   |    [  ]_|   <-- vector loop.
2907   |     |
2908   |     v
2909   |   -[ ]   <--- middle-block.
2910   |  /  |
2911   | /   v
2912   -|- >[ ]     <--- new preheader.
2913    |    |
2914    |    v
2915    |   [ ] \
2916    |   [ ]_|   <-- old scalar loop to handle remainder.
2917     \   |
2918      \  v
2919       >[ ]     <-- exit block.
2920    ...
2921    */
2922 
2923   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2924   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2925   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2926   MDNode *OrigLoopID = OrigLoop->getLoopID();
2927   assert(VectorPH && "Invalid loop structure");
2928   assert(ExitBlock && "Must have an exit block");
2929 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
2934   //
  // We try hard to obtain an induction variable from the original loop.
  // However, if we don't find one that:
2937   //   - is an integer
2938   //   - counts from zero, stepping by one
2939   //   - is the size of the widest induction variable type
2940   // then we create a new one.
2941   OldInduction = Legal->getPrimaryInduction();
2942   Type *IdxTy = Legal->getWidestInductionType();
2943 
  // Split the single-block loop into the two-loop structure described above.
2945   BasicBlock *VecBody =
2946       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2947   BasicBlock *MiddleBlock =
2948       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2949   BasicBlock *ScalarPH =
2950       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2951 
2952   // Create and register the new vector loop.
2953   Loop *Lp = LI->AllocateLoop();
2954   Loop *ParentLoop = OrigLoop->getParentLoop();
2955 
2956   // Insert the new loop into the loop nest and register the new basic blocks
2957   // before calling any utilities such as SCEV that require valid LoopInfo.
2958   if (ParentLoop) {
2959     ParentLoop->addChildLoop(Lp);
2960     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2961     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2962   } else {
2963     LI->addTopLevelLoop(Lp);
2964   }
2965   Lp->addBasicBlockToLoop(VecBody, *LI);
2966 
2967   // Find the loop boundaries.
2968   Value *Count = getOrCreateTripCount(Lp);
2969 
2970   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2971 
2972   // Now, compare the new count to zero. If it is zero skip the vector loop and
2973   // jump to the scalar loop. This check also covers the case where the
2974   // backedge-taken count is uint##_max: adding one to it will overflow leading
2975   // to an incorrect trip count of zero. In this (rare) case we will also jump
2976   // to the scalar loop.
2977   emitMinimumIterationCountCheck(Lp, ScalarPH);
2978 
2979   // Generate the code to check any assumptions that we've made for SCEV
2980   // expressions.
2981   emitSCEVChecks(Lp, ScalarPH);
2982 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2986   emitMemRuntimeChecks(Lp, ScalarPH);
2987 
2988   // Generate the induction variable.
2989   // The loop step is equal to the vectorization factor (num of SIMD elements)
2990   // times the unroll factor (num of SIMD instructions).
2991   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2992   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2993   Induction =
2994       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2995                               getDebugLocFromInstOrOperands(OldInduction));
2996 
2997   // We are going to resume the execution of the scalar loop.
2998   // Go over all of the induction variables that we found and fix the
2999   // PHIs that are left in the scalar version of the loop.
3000   // The starting values of PHI nodes depend on the counter of the last
3001   // iteration in the vectorized loop.
3002   // If we come from a bypass edge then we need to start from the original
3003   // start value.
3004 
3005   // This variable saves the new starting index for the scalar loop. It is used
3006   // to test if there are any tail iterations left once the vector loop has
3007   // completed.
3008   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3009   for (auto &InductionEntry : *List) {
3010     PHINode *OrigPhi = InductionEntry.first;
3011     InductionDescriptor II = InductionEntry.second;
3012 
    // Create phi nodes to merge from the backedge-taken check block.
3014     PHINode *BCResumeVal = PHINode::Create(
3015         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3016     // Copy original phi DL over to the new one.
3017     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3018     Value *&EndValue = IVEndValues[OrigPhi];
3019     if (OrigPhi == OldInduction) {
3020       // We know what the end value is.
3021       EndValue = CountRoundDown;
3022     } else {
3023       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3024       Type *StepType = II.getStep()->getType();
3025       Instruction::CastOps CastOp =
3026         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3027       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3028       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3029       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3030       EndValue->setName("ind.end");
3031     }
3032 
3033     // The new PHI merges the original incoming value, in case of a bypass,
3034     // or the value at the end of the vectorized loop.
3035     BCResumeVal->addIncoming(EndValue, MiddleBlock);
3036 
3037     // Fix the scalar body counter (PHI node).
3038     // The old induction's phi node in the scalar body needs the truncated
3039     // value.
3040     for (BasicBlock *BB : LoopBypassBlocks)
3041       BCResumeVal->addIncoming(II.getStartValue(), BB);
3042     OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
3043   }
3044 
3045   // We need the OrigLoop (scalar loop part) latch terminator to help
3046   // produce correct debug info for the middle block BB instructions.
3047   // The legality check stage guarantees that the loop will have a single
3048   // latch.
3049   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3050          "Scalar loop latch terminator isn't a branch");
3051   BranchInst *ScalarLatchBr =
3052       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3053 
3054   // Add a check in the middle block to see if we have completed
3055   // all of the iterations in the first vector loop.
3056   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3057   // If tail is to be folded, we know we don't need to run the remainder.
3058   Value *CmpN = Builder.getTrue();
3059   if (!Cost->foldTailByMasking()) {
3060     CmpN =
3061         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3062                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3063 
3064     // Here we use the same DebugLoc as the scalar loop latch branch instead
3065     // of the corresponding compare because they may have ended up with
3066     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare got a line number inside the loop.
3068     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3069   }
3070 
3071   BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3072   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3073   ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3074 
3075   // Get ready to start creating new instructions into the vectorized body.
3076   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3077 
3078   // Save the state.
3079   LoopVectorPreHeader = Lp->getLoopPreheader();
3080   LoopScalarPreHeader = ScalarPH;
3081   LoopMiddleBlock = MiddleBlock;
3082   LoopExitBlock = ExitBlock;
3083   LoopVectorBody = VecBody;
3084   LoopScalarBody = OldBasicBlock;
3085 
3086   Optional<MDNode *> VectorizedLoopID =
3087       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3088                                       LLVMLoopVectorizeFollowupVectorized});
3089   if (VectorizedLoopID.hasValue()) {
3090     Lp->setLoopID(VectorizedLoopID.getValue());
3091 
3092     // Do not setAlreadyVectorized if loop attributes have been defined
3093     // explicitly.
3094     return LoopVectorPreHeader;
3095   }
3096 
3097   // Keep all loop hints from the original loop on the vector loop (we'll
3098   // replace the vectorizer-specific hints below).
3099   if (MDNode *LID = OrigLoop->getLoopID())
3100     Lp->setLoopID(LID);
3101 
3102   LoopVectorizeHints Hints(Lp, true, *ORE);
3103   Hints.setAlreadyVectorized();
3104 
3105   return LoopVectorPreHeader;
3106 }
3107 
3108 // Fix up external users of the induction variable. At this point, we are
3109 // in LCSSA form, with all external PHIs that use the IV having one input value,
3110 // coming from the remainder loop. We need those PHIs to also have a correct
3111 // value for the IV when arriving directly from the middle block.
3112 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3113                                        const InductionDescriptor &II,
3114                                        Value *CountRoundDown, Value *EndValue,
3115                                        BasicBlock *MiddleBlock) {
3116   // There are two kinds of external IV usages - those that use the value
3117   // computed in the last iteration (the PHI) and those that use the penultimate
3118   // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.
3120 
3121   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3122 
3123   DenseMap<Value *, Value *> MissingVals;
3124 
3125   // An external user of the last iteration's value should see the value that
3126   // the remainder loop uses to initialize its own IV.
3127   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3128   for (User *U : PostInc->users()) {
3129     Instruction *UI = cast<Instruction>(U);
3130     if (!OrigLoop->contains(UI)) {
3131       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3132       MissingVals[UI] = EndValue;
3133     }
3134   }
3135 
  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is, Start + (Step * (CRD - 1)).
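  // For example (illustrative): for an IV starting at 0 with step 1, an
  // external user of the phi must see CRD - 1 when the scalar remainder loop
  // is skipped, which is exactly Start + Step * (CRD - 1).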
3139   for (User *U : OrigPhi->users()) {
3140     auto *UI = cast<Instruction>(U);
3141     if (!OrigLoop->contains(UI)) {
3142       const DataLayout &DL =
3143           OrigLoop->getHeader()->getModule()->getDataLayout();
3144       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3145 
3146       IRBuilder<> B(MiddleBlock->getTerminator());
3147       Value *CountMinusOne = B.CreateSub(
3148           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3149       Value *CMO =
3150           !II.getStep()->getType()->isIntegerTy()
3151               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3152                              II.getStep()->getType())
3153               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3154       CMO->setName("cast.cmo");
3155       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3156       Escape->setName("ind.escape");
3157       MissingVals[UI] = Escape;
3158     }
3159   }
3160 
3161   for (auto &I : MissingVals) {
3162     PHINode *PHI = cast<PHINode>(I.first);
3163     // One corner case we have to handle is two IVs "chasing" each-other,
3164     // that is %IV2 = phi [...], [ %IV1, %latch ]
3165     // In this case, if IV1 has an external use, we need to avoid adding both
3166     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3167     // don't already have an incoming value for the middle block.
3168     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3169       PHI->addIncoming(I.second, MiddleBlock);
3170   }
3171 }
3172 
3173 namespace {
3174 
3175 struct CSEDenseMapInfo {
3176   static bool canHandle(const Instruction *I) {
3177     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3178            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3179   }
3180 
3181   static inline Instruction *getEmptyKey() {
3182     return DenseMapInfo<Instruction *>::getEmptyKey();
3183   }
3184 
3185   static inline Instruction *getTombstoneKey() {
3186     return DenseMapInfo<Instruction *>::getTombstoneKey();
3187   }
3188 
3189   static unsigned getHashValue(const Instruction *I) {
3190     assert(canHandle(I) && "Unknown instruction!");
3191     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3192                                                            I->value_op_end()));
3193   }
3194 
3195   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3196     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3197         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3198       return LHS == RHS;
3199     return LHS->isIdenticalTo(RHS);
3200   }
3201 };
3202 
3203 } // end anonymous namespace
3204 
/// Perform CSE of induction variable instructions.
3206 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3208   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3209   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3210     Instruction *In = &*I++;
3211 
3212     if (!CSEDenseMapInfo::canHandle(In))
3213       continue;
3214 
3215     // Check if we can replace this instruction with any of the
3216     // visited instructions.
3217     if (Instruction *V = CSEMap.lookup(In)) {
3218       In->replaceAllUsesWith(V);
3219       In->eraseFromParent();
3220       continue;
3221     }
3222 
3223     CSEMap[In] = In;
3224   }
3225 }
3226 
3227 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3228                                                        unsigned VF,
3229                                                        bool &NeedToScalarize) {
3230   Function *F = CI->getCalledFunction();
  StringRef FnName = F->getName();
3232   Type *ScalarRetTy = CI->getType();
3233   SmallVector<Type *, 4> Tys, ScalarTys;
3234   for (auto &ArgOp : CI->arg_operands())
3235     ScalarTys.push_back(ArgOp->getType());
3236 
3237   // Estimate cost of scalarized vector call. The source operands are assumed
3238   // to be vectors, so we need to extract individual elements from there,
3239   // execute VF scalar calls, and then gather the result into the vector return
3240   // value.
3241   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3242   if (VF == 1)
3243     return ScalarCallCost;
3244 
3245   // Compute corresponding vector type for return value and arguments.
3246   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3247   for (Type *ScalarTy : ScalarTys)
3248     Tys.push_back(ToVectorTy(ScalarTy, VF));
3249 
3250   // Compute costs of unpacking argument values for the scalar calls and
3251   // packing the return values to a vector.
3252   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3253 
3254   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
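
  // Worked example (illustrative): with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 12, Cost is 10 * 4 + 12 = 52. If a vectorized
  // library variant is available and cheaper, it is returned below instead and
  // NeedToScalarize is cleared.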
3255 
3256   // If we can't emit a vector call for this function, then the currently found
3257   // cost is the cost we need to return.
3258   NeedToScalarize = true;
3259   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3260     return Cost;
3261 
3262   // If the corresponding vector cost is cheaper, return its cost.
3263   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3264   if (VectorCallCost < Cost) {
3265     NeedToScalarize = false;
3266     return VectorCallCost;
3267   }
3268   return Cost;
3269 }
3270 
3271 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3272                                                             unsigned VF) {
3273   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3274   assert(ID && "Expected intrinsic call!");
3275 
3276   FastMathFlags FMF;
3277   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3278     FMF = FPMO->getFastMathFlags();
3279 
3280   SmallVector<Value *, 4> Operands(CI->arg_operands());
3281   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3282 }
3283 
3284 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3285   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3286   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3287   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3288 }
3289 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3290   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3291   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3292   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3293 }
3294 
3295 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3296   // For every instruction `I` in MinBWs, truncate the operands, create a
3297   // truncated version of `I` and reextend its result. InstCombine runs
3298   // later and will remove any ext/trunc pairs.
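  // For example (illustrative): if a vectorized i32 add is known to need only
  // 8 bits, its operands are truncated to <VF x i8>, the add is recreated on
  // the narrow type, and the result is zero-extended back to <VF x i32>.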
3299   SmallPtrSet<Value *, 4> Erased;
3300   for (const auto &KV : Cost->getMinimalBitwidths()) {
3301     // If the value wasn't vectorized, we must maintain the original scalar
3302     // type. The absence of the value from VectorLoopValueMap indicates that it
3303     // wasn't vectorized.
3304     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3305       continue;
3306     for (unsigned Part = 0; Part < UF; ++Part) {
3307       Value *I = getOrCreateVectorValue(KV.first, Part);
3308       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3309           !isa<Instruction>(I))
3310         continue;
3311       Type *OriginalTy = I->getType();
3312       Type *ScalarTruncatedTy =
3313           IntegerType::get(OriginalTy->getContext(), KV.second);
3314       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3315                                           OriginalTy->getVectorNumElements());
3316       if (TruncatedTy == OriginalTy)
3317         continue;
3318 
3319       IRBuilder<> B(cast<Instruction>(I));
3320       auto ShrinkOperand = [&](Value *V) -> Value * {
3321         if (auto *ZI = dyn_cast<ZExtInst>(V))
3322           if (ZI->getSrcTy() == TruncatedTy)
3323             return ZI->getOperand(0);
3324         return B.CreateZExtOrTrunc(V, TruncatedTy);
3325       };
3326 
3327       // The actual instruction modification depends on the instruction type,
3328       // unfortunately.
3329       Value *NewI = nullptr;
3330       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3331         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3332                              ShrinkOperand(BO->getOperand(1)));
3333 
3334         // Any wrapping introduced by shrinking this operation shouldn't be
3335         // considered undefined behavior. So, we can't unconditionally copy
3336         // arithmetic wrapping flags to NewI.
3337         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3338       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3339         NewI =
3340             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3341                          ShrinkOperand(CI->getOperand(1)));
3342       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3343         NewI = B.CreateSelect(SI->getCondition(),
3344                               ShrinkOperand(SI->getTrueValue()),
3345                               ShrinkOperand(SI->getFalseValue()));
3346       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3347         switch (CI->getOpcode()) {
3348         default:
3349           llvm_unreachable("Unhandled cast!");
3350         case Instruction::Trunc:
3351           NewI = ShrinkOperand(CI->getOperand(0));
3352           break;
3353         case Instruction::SExt:
3354           NewI = B.CreateSExtOrTrunc(
3355               CI->getOperand(0),
3356               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3357           break;
3358         case Instruction::ZExt:
3359           NewI = B.CreateZExtOrTrunc(
3360               CI->getOperand(0),
3361               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3362           break;
3363         }
3364       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3365         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3366         auto *O0 = B.CreateZExtOrTrunc(
3367             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3368         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3369         auto *O1 = B.CreateZExtOrTrunc(
3370             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3371 
3372         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3373       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3374         // Don't do anything with the operands, just extend the result.
3375         continue;
3376       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3377         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3378         auto *O0 = B.CreateZExtOrTrunc(
3379             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3380         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3381         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3382       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3383         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3384         auto *O0 = B.CreateZExtOrTrunc(
3385             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3386         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3387       } else {
3388         // If we don't know what to do, be conservative and don't do anything.
3389         continue;
3390       }
3391 
3392       // Lastly, extend the result.
3393       NewI->takeName(cast<Instruction>(I));
3394       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3395       I->replaceAllUsesWith(Res);
3396       cast<Instruction>(I)->eraseFromParent();
3397       Erased.insert(I);
3398       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3399     }
3400   }
3401 
3402   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3403   for (const auto &KV : Cost->getMinimalBitwidths()) {
3404     // If the value wasn't vectorized, we must maintain the original scalar
3405     // type. The absence of the value from VectorLoopValueMap indicates that it
3406     // wasn't vectorized.
3407     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3408       continue;
3409     for (unsigned Part = 0; Part < UF; ++Part) {
3410       Value *I = getOrCreateVectorValue(KV.first, Part);
3411       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3412       if (Inst && Inst->use_empty()) {
3413         Value *NewI = Inst->getOperand(0);
3414         Inst->eraseFromParent();
3415         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3416       }
3417     }
3418   }
3419 }
3420 
3421 void InnerLoopVectorizer::fixVectorizedLoop() {
3422   // Insert truncates and extends for any truncated instructions as hints to
3423   // InstCombine.
3424   if (VF > 1)
3425     truncateToMinimalBitwidths();
3426 
3427   // Fix widened non-induction PHIs by setting up the PHI operands.
3428   if (OrigPHIsToFix.size()) {
3429     assert(EnableVPlanNativePath &&
3430            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3431     fixNonInductionPHIs();
3432   }
3433 
3434   // At this point every instruction in the original loop is widened to a
3435   // vector form. Now we need to fix the recurrences in the loop. These PHI
3436   // nodes are currently empty because we did not want to introduce cycles.
3437   // This is the second stage of vectorizing recurrences.
3438   fixCrossIterationPHIs();
3439 
3440   // Update the dominator tree.
3441   //
3442   // FIXME: After creating the structure of the new loop, the dominator tree is
3443   //        no longer up-to-date, and it remains that way until we update it
3444   //        here. An out-of-date dominator tree is problematic for SCEV,
3445   //        because SCEVExpander uses it to guide code generation. The
3446   //        vectorizer use SCEVExpanders in several places. Instead, we should
3447   //        keep the dominator tree up-to-date as we go.
3448   updateAnalysis();
3449 
3450   // Fix-up external users of the induction variables.
3451   for (auto &Entry : *Legal->getInductionVars())
3452     fixupIVUsers(Entry.first, Entry.second,
3453                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3454                  IVEndValues[Entry.first], LoopMiddleBlock);
3455 
3456   fixLCSSAPHIs();
3457   for (Instruction *PI : PredicatedInstructions)
3458     sinkScalarOperands(&*PI);
3459 
3460   // Remove redundant induction instructions.
3461   cse(LoopVectorBody);
3462 }
3463 
3464 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3465   // In order to support recurrences we need to be able to vectorize Phi nodes.
3466   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3467   // stage #2: We now need to fix the recurrences by adding incoming edges to
3468   // the currently empty PHI nodes. At this point every instruction in the
3469   // original loop is widened to a vector form so we can use them to construct
3470   // the incoming edges.
3471   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3472     // Handle first-order recurrences and reductions that need to be fixed.
3473     if (Legal->isFirstOrderRecurrence(&Phi))
3474       fixFirstOrderRecurrence(&Phi);
3475     else if (Legal->isReductionVariable(&Phi))
3476       fixReduction(&Phi);
3477   }
3478 }
3479 
3480 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3481   // This is the second phase of vectorizing first-order recurrences. An
3482   // overview of the transformation is described below. Suppose we have the
3483   // following loop.
3484   //
3485   //   for (int i = 0; i < n; ++i)
3486   //     b[i] = a[i] - a[i - 1];
3487   //
3488   // There is a first-order recurrence on "a". For this loop, the shorthand
3489   // scalar IR looks like:
3490   //
3491   //   scalar.ph:
3492   //     s_init = a[-1]
3493   //     br scalar.body
3494   //
3495   //   scalar.body:
3496   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3497   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3498   //     s2 = a[i]
3499   //     b[i] = s2 - s1
3500   //     br cond, scalar.body, ...
3501   //
  // In this example, s1 is a recurrence because its value depends on the
3503   // previous iteration. In the first phase of vectorization, we created a
3504   // temporary value for s1. We now complete the vectorization and produce the
3505   // shorthand vector IR shown below (for VF = 4, UF = 1).
3506   //
3507   //   vector.ph:
3508   //     v_init = vector(..., ..., ..., a[-1])
3509   //     br vector.body
3510   //
3511   //   vector.body
3512   //     i = phi [0, vector.ph], [i+4, vector.body]
3513   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3514   //     v2 = a[i, i+1, i+2, i+3];
3515   //     v3 = vector(v1(3), v2(0, 1, 2))
3516   //     b[i, i+1, i+2, i+3] = v2 - v3
3517   //     br cond, vector.body, middle.block
3518   //
3519   //   middle.block:
3520   //     x = v2(3)
3521   //     br scalar.ph
3522   //
3523   //   scalar.ph:
3524   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3525   //     br scalar.body
3526   //
3527   // After execution completes the vector loop, we extract the next value of
3528   // the recurrence (x) to use as the initial value in the scalar loop.
3529 
3530   // Get the original loop preheader and single loop latch.
3531   auto *Preheader = OrigLoop->getLoopPreheader();
3532   auto *Latch = OrigLoop->getLoopLatch();
3533 
3534   // Get the initial and previous values of the scalar recurrence.
3535   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3536   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3537 
3538   // Create a vector from the initial value.
3539   auto *VectorInit = ScalarInit;
3540   if (VF > 1) {
3541     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3542     VectorInit = Builder.CreateInsertElement(
3543         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3544         Builder.getInt32(VF - 1), "vector.recur.init");
3545   }
3546 
3547   // We constructed a temporary phi node in the first phase of vectorization.
3548   // This phi node will eventually be deleted.
3549   Builder.SetInsertPoint(
3550       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3551 
3552   // Create a phi node for the new recurrence. The current value will either be
3553   // the initial value inserted into a vector or loop-varying vector value.
3554   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3555   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3556 
3557   // Get the vectorized previous value of the last part UF - 1. It appears last
3558   // among all unrolled iterations, due to the order of their construction.
3559   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3560 
3561   // Find and set the insertion point after the previous value if it is an
3562   // instruction.
3563   BasicBlock::iterator InsertPt;
3564   // Note that the previous value may have been constant-folded so it is not
3565   // guaranteed to be an instruction in the vector loop.
3566   // FIXME: Loop invariant values do not form recurrences. We should deal with
3567   //        them earlier.
3568   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3569     InsertPt = LoopVectorBody->getFirstInsertionPt();
3570   else {
3571     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3572     if (isa<PHINode>(PreviousLastPart))
3573       // If the previous value is a phi node, we should insert after all the phi
3574       // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different from
      // LoopVectorBody, in case we predicate the loop.
3577       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3578     else
3579       InsertPt = ++PreviousInst->getIterator();
3580   }
3581   Builder.SetInsertPoint(&*InsertPt);
3582 
3583   // We will construct a vector for the recurrence by combining the values for
3584   // the current and previous iterations. This is the required shuffle mask.
3585   SmallVector<Constant *, 8> ShuffleMask(VF);
3586   ShuffleMask[0] = Builder.getInt32(VF - 1);
3587   for (unsigned I = 1; I < VF; ++I)
3588     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
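  // For example (illustrative), with VF = 4 the mask is <3, 4, 5, 6>: the last
  // element of the previous vector followed by the first three elements of the
  // current one.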
3589 
3590   // The vector from which to take the initial value for the current iteration
3591   // (actual or unrolled). Initially, this is the vector phi node.
3592   Value *Incoming = VecPhi;
3593 
3594   // Shuffle the current and previous vector and update the vector parts.
3595   for (unsigned Part = 0; Part < UF; ++Part) {
3596     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3597     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3598     auto *Shuffle =
3599         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3600                                              ConstantVector::get(ShuffleMask))
3601                : Incoming;
3602     PhiPart->replaceAllUsesWith(Shuffle);
3603     cast<Instruction>(PhiPart)->eraseFromParent();
3604     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3605     Incoming = PreviousPart;
3606   }
3607 
3608   // Fix the latch value of the new recurrence in the vector loop.
3609   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3610 
3611   // Extract the last vector element in the middle block. This will be the
3612   // initial value for the recurrence when jumping to the scalar loop.
3613   auto *ExtractForScalar = Incoming;
3614   if (VF > 1) {
3615     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3616     ExtractForScalar = Builder.CreateExtractElement(
3617         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3618   }
  // Extract the second-to-last element in the middle block if the
3620   // Phi is used outside the loop. We need to extract the phi itself
3621   // and not the last element (the phi update in the current iteration). This
3622   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3623   // when the scalar loop is not run at all.
3624   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3625   if (VF > 1)
3626     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3627         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
3632   else if (UF > 1)
3633     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
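  // To illustrate with VF = 4: lane VF-1 of `Incoming` holds the recurrence
  // update of the last vectorized iteration (the scalar loop's initial value),
  // while lane VF-2 holds the phi's own value in that iteration, which is what
  // a user of the phi outside the loop observes.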
3634 
3635   // Fix the initial value of the original recurrence in the scalar loop.
3636   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3637   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3638   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3639     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3640     Start->addIncoming(Incoming, BB);
3641   }
3642 
3643   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3644   Phi->setName("scalar.recur");
3645 
3646   // Finally, fix users of the recurrence outside the loop. The users will need
3647   // either the last value of the scalar recurrence or the last value of the
3648   // vector recurrence we extracted in the middle block. Since the loop is in
3649   // LCSSA form, we just need to find all the phi nodes for the original scalar
3650   // recurrence in the exit block, and then add an edge for the middle block.
3651   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3652     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3653       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3654     }
3655   }
3656 }
3657 
3658 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3659   Constant *Zero = Builder.getInt32(0);
3660 
3661   // Get its reduction variable descriptor.
3662   assert(Legal->isReductionVariable(Phi) &&
3663          "Unable to find the reduction variable");
3664   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3665 
3666   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3667   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3668   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3669   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3670     RdxDesc.getMinMaxRecurrenceKind();
3671   setDebugLocFromInst(Builder, ReductionStartValue);
3672 
3673   // We need to generate a reduction vector from the incoming scalar.
3674   // To do so, we need to generate the 'identity' vector and override
3675   // one of the elements with the incoming scalar reduction. We need
3676   // to do it in the vector-loop preheader.
3677   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3678 
3679   // This is the vector-clone of the value that leaves the loop.
3680   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3681 
3682   // Find the reduction identity variable. Zero for addition, or and xor;
3683   // one for multiplication; -1 for and.
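  // For example, an integer add reduction with VF = 4 and scalar start value
  // %s uses Identity = <0, 0, 0, 0> and VectorStart = <%s, 0, 0, 0>, so that
  // only lane 0 contributes the incoming start value.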
3684   Value *Identity;
3685   Value *VectorStart;
3686   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3687       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3688     // MinMax reductions have the start value as their identity.
3689     if (VF == 1) {
3690       VectorStart = Identity = ReductionStartValue;
3691     } else {
3692       VectorStart = Identity =
3693         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3694     }
3695   } else {
3696     // Handle other reduction kinds:
3697     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3698         RK, VecTy->getScalarType());
3699     if (VF == 1) {
3700       Identity = Iden;
3701       // This vector is the Identity vector where the first element is the
3702       // incoming scalar reduction.
3703       VectorStart = ReductionStartValue;
3704     } else {
3705       Identity = ConstantVector::getSplat(VF, Iden);
3706 
3707       // This vector is the Identity vector where the first element is the
3708       // incoming scalar reduction.
3709       VectorStart =
3710         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3711     }
3712   }
3713 
3714   // Fix the vector-loop phi.
3715 
3716   // Reductions do not have to start at zero. They can start with
3717   // any loop invariant values.
3718   BasicBlock *Latch = OrigLoop->getLoopLatch();
3719   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3720   for (unsigned Part = 0; Part < UF; ++Part) {
3721     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3722     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3723     // Make sure to add the reduction start value only to the
3724     // first unroll part.
3725     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3726     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3727     cast<PHINode>(VecRdxPhi)
3728       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3729   }
3730 
3731   // Before each round, move the insertion point right between
3732   // the PHIs and the values we are going to write.
3733   // This allows us to write both PHINodes and the extractelement
3734   // instructions.
3735   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3736 
3737   setDebugLocFromInst(Builder, LoopExitInst);
3738 
3739   // If the tail is folded by masking, the vector value that leaves the loop
3740   // should be a select choosing between the vectorized LoopExitInst and the
3741   // vectorized Phi, instead of the former.
3742   if (Cost->foldTailByMasking()) {
3743     for (unsigned Part = 0; Part < UF; ++Part) {
3744       Value *VecLoopExitInst =
3745           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3746       Value *Sel = nullptr;
3747       for (User *U : VecLoopExitInst->users()) {
3748         if (isa<SelectInst>(U)) {
3749           assert(!Sel && "Reduction exit feeding two selects");
3750           Sel = U;
3751         } else
3752           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3753       }
3754       assert(Sel && "Reduction exit feeds no select");
3755       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3756     }
3757   }
3758 
3759   // If the vector reduction can be performed in a smaller type, we truncate
3760   // then extend the loop exit value to enable InstCombine to evaluate the
3761   // entire expression in the smaller type.
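  // For example, a reduction known to fit in i8 but whose phi is i32 is
  // truncated to <VF x i8> and then sign/zero-extended back, so the rest of
  // the expression can later be narrowed as well.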
3762   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3763     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3764     Builder.SetInsertPoint(
3765         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3766     VectorParts RdxParts(UF);
3767     for (unsigned Part = 0; Part < UF; ++Part) {
3768       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3769       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3770       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3771                                         : Builder.CreateZExt(Trunc, VecTy);
3772       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3773            UI != RdxParts[Part]->user_end();)
3774         if (*UI != Trunc) {
3775           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3776           RdxParts[Part] = Extnd;
3777         } else {
3778           ++UI;
3779         }
3780     }
3781     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3782     for (unsigned Part = 0; Part < UF; ++Part) {
3783       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3784       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3785     }
3786   }
3787 
3788   // Reduce all of the unrolled parts into a single vector.
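  // For example, with UF = 2 an integer add reduction emits a single 'bin.rdx'
  // add of the two part values, while min/max reductions combine the parts
  // with a compare-and-select via createMinMaxOp.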
3789   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3790   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3791 
3792   // The middle block terminator has already been assigned a DebugLoc here (the
3793   // OrigLoop's single latch terminator). We want the whole middle block to
3794   // appear to execute on this line because: (a) it is all compiler generated,
3795   // (b) these instructions are always executed after evaluating the latch
3796   // conditional branch, and (c) other passes may add new predecessors which
3797   // terminate on this line. This is the easiest way to ensure we don't
3798   // accidentally cause an extra step back into the loop while debugging.
3799   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3800   for (unsigned Part = 1; Part < UF; ++Part) {
3801     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3802     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3803       // Floating point operations had to be 'fast' to enable the reduction.
3804       ReducedPartRdx = addFastMathFlag(
3805           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3806                               ReducedPartRdx, "bin.rdx"),
3807           RdxDesc.getFastMathFlags());
3808     else
3809       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3810                                       RdxPart);
3811   }
3812 
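  // Reduce the VF lanes of the combined vector to a single scalar, e.g., with
  // a vector reduction intrinsic or a log2(VF) shuffle-based tree, depending
  // on what createTargetReduction selects for the target.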
3813   if (VF > 1) {
3814     bool NoNaN = Legal->hasFunNoNaNAttr();
3815     ReducedPartRdx =
3816         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3817     // If the reduction can be performed in a smaller type, we need to extend
3818     // the reduction to the wider type before we branch to the original loop.
3819     if (Phi->getType() != RdxDesc.getRecurrenceType())
3820       ReducedPartRdx =
3821         RdxDesc.isSigned()
3822         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3823         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3824   }
3825 
3826   // Create a phi node that merges control-flow from the backedge-taken check
3827   // block and the middle block.
3828   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3829                                         LoopScalarPreHeader->getTerminator());
3830   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3831     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3832   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3833 
3834   // Now, we need to fix the users of the reduction variable
3835   // inside and outside of the scalar remainder loop.
3836   // We know that the loop is in LCSSA form. We need to update the
3837   // PHI nodes in the exit blocks.
3838   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3839     // All PHINodes need to have a single entry edge, or two if
3840     // we already fixed them.
3841     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3842 
3843     // We found a reduction value exit-PHI. Update it with the
3844     // incoming bypass edge.
3845     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3846       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3847   } // end of the LCSSA phi scan.
3848 
3849   // Fix the scalar loop reduction variable with the incoming reduction sum
3850   // from the vector body and from the backedge value.
3851   int IncomingEdgeBlockIdx =
3852     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3853   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3854   // Pick the other block.
3855   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3856   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3857   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3858 }
3859 
3860 void InnerLoopVectorizer::fixLCSSAPHIs() {
3861   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3862     if (LCSSAPhi.getNumIncomingValues() == 1) {
3863       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3864       // Non-instruction incoming values have only a single value; use lane 0.
3865       unsigned LastLane = 0;
3866       if (isa<Instruction>(IncomingValue))
3867           LastLane = Cost->isUniformAfterVectorization(
3868                          cast<Instruction>(IncomingValue), VF)
3869                          ? 0
3870                          : VF - 1;
3871       // Can be a loop invariant incoming value or the last scalar value to be
3872       // extracted from the vectorized loop.
3873       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3874       Value *lastIncomingValue =
3875           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3876       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3877     }
3878   }
3879 }
3880 
3881 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3882   // The basic block and loop containing the predicated instruction.
3883   auto *PredBB = PredInst->getParent();
3884   auto *VectorLoop = LI->getLoopFor(PredBB);
3885 
3886   // Initialize a worklist with the operands of the predicated instruction.
3887   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3888 
3889   // Holds instructions that we need to analyze again. An instruction may be
3890   // reanalyzed if we don't yet know if we can sink it or not.
3891   SmallVector<Instruction *, 8> InstsToReanalyze;
3892 
3893   // Returns true if a given use occurs in the predicated block. Phi nodes use
3894   // their operands in their corresponding predecessor blocks.
3895   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3896     auto *I = cast<Instruction>(U.getUser());
3897     BasicBlock *BB = I->getParent();
3898     if (auto *Phi = dyn_cast<PHINode>(I))
3899       BB = Phi->getIncomingBlock(
3900           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3901     return BB == PredBB;
3902   };
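  // For example, an address computation used only by the predicated
  // instruction can be sunk into the predicated block, whereas an instruction
  // with a use outside that block (or with side effects) is left in place.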
3903 
3904   // Iteratively sink the scalarized operands of the predicated instruction
3905   // into the block we created for it. When an instruction is sunk, its
3906   // operands are then added to the worklist. The algorithm ends after one pass
3907   // through the worklist doesn't sink a single instruction.
3908   bool Changed;
3909   do {
3910     // Add the instructions that need to be reanalyzed to the worklist, and
3911     // reset the changed indicator.
3912     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3913     InstsToReanalyze.clear();
3914     Changed = false;
3915 
3916     while (!Worklist.empty()) {
3917       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3918 
3919       // We can't sink an instruction if it is a phi node, is already in the
3920       // predicated block, is not in the loop, or may have side effects.
3921       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3922           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3923         continue;
3924 
3925       // It's legal to sink the instruction if all its uses occur in the
3926       // predicated block. Otherwise, there's nothing to do yet, and we may
3927       // need to reanalyze the instruction.
3928       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3929         InstsToReanalyze.push_back(I);
3930         continue;
3931       }
3932 
3933       // Move the instruction to the beginning of the predicated block, and add
3934       // its operands to the worklist.
3935       I->moveBefore(&*PredBB->getFirstInsertionPt());
3936       Worklist.insert(I->op_begin(), I->op_end());
3937 
3938       // The sinking may have enabled other instructions to be sunk, so we will
3939       // need to iterate.
3940       Changed = true;
3941     }
3942   } while (Changed);
3943 }
3944 
3945 void InnerLoopVectorizer::fixNonInductionPHIs() {
3946   for (PHINode *OrigPhi : OrigPHIsToFix) {
3947     PHINode *NewPhi =
3948         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3949     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3950 
3951     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3952         predecessors(OrigPhi->getParent()));
3953     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3954         predecessors(NewPhi->getParent()));
3955     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3956            "Scalar and Vector BB should have the same number of predecessors");
3957 
3958     // The insertion point in Builder may be invalidated by the time we get
3959     // here. Force the Builder insertion point to something valid so that we do
3960     // not run into issues during insertion point restore in
3961     // getOrCreateVectorValue calls below.
3962     Builder.SetInsertPoint(NewPhi);
3963 
3964     // The predecessor order is preserved and we can rely on mapping between
3965     // scalar and vector block predecessors.
3966     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3967       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3968 
3969       // When looking up the new scalar/vector values to fix up, use incoming
3970       // values from original phi.
3971       Value *ScIncV =
3972           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3973 
3974       // Scalar incoming value may need a broadcast
3975       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3976       NewPhi->addIncoming(NewIncV, NewPredBB);
3977     }
3978   }
3979 }
3980 
3981 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
3982                                    unsigned VF, bool IsPtrLoopInvariant,
3983                                    SmallBitVector &IsIndexLoopInvariant) {
3984   // Construct a vector GEP by widening the operands of the scalar GEP as
3985   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3986   // results in a vector of pointers when at least one operand of the GEP
3987   // is vector-typed. Thus, to keep the representation compact, we only use
3988   // vector-typed operands for loop-varying values.
3989 
3990   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
3991     // If we are vectorizing, but the GEP has only loop-invariant operands,
3992     // the GEP we build (by only using vector-typed operands for
3993     // loop-varying values) would be a scalar pointer. Thus, to ensure we
3994     // produce a vector of pointers, we need to either arbitrarily pick an
3995     // operand to broadcast, or broadcast a clone of the original GEP.
3996     // Here, we broadcast a clone of the original.
3997     //
3998     // TODO: If at some point we decide to scalarize instructions having
3999     //       loop-invariant operands, this special case will no longer be
4000     //       required. We would add the scalarization decision to
4001     //       collectLoopScalars() and teach getVectorValue() to broadcast
4002     //       the lane-zero scalar value.
4003     auto *Clone = Builder.Insert(GEP->clone());
4004     for (unsigned Part = 0; Part < UF; ++Part) {
4005       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4006       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4007       addMetadata(EntryPart, GEP);
4008     }
4009   } else {
4010     // If the GEP has at least one loop-varying operand, we are sure to
4011     // produce a vector of pointers. But if we are only unrolling, we want
4012     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4013     // produce with the code below will be scalar (if VF == 1) or vector
4014     // (otherwise). Note that for the unroll-only case, we still maintain
4015     // values in the vector mapping with initVector, as we do for other
4016     // instructions.
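    // For example, for 'getelementptr inbounds i32, i32* %base, i64 %iv' with
    // a loop-invariant %base and a loop-varying %iv, %base stays scalar and
    // %iv is widened per part, yielding a <VF x i32*> GEP when VF > 1.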
4017     for (unsigned Part = 0; Part < UF; ++Part) {
4018       // The pointer operand of the new GEP. If it's loop-invariant, we
4019       // won't broadcast it.
4020       auto *Ptr = IsPtrLoopInvariant
4021                       ? GEP->getPointerOperand()
4022                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4023 
4024       // Collect all the indices for the new GEP. If any index is
4025       // loop-invariant, we won't broadcast it.
4026       SmallVector<Value *, 4> Indices;
4027       for (auto Index : enumerate(GEP->indices())) {
4028         Value *User = Index.value().get();
4029         if (IsIndexLoopInvariant[Index.index()])
4030           Indices.push_back(User);
4031         else
4032           Indices.push_back(getOrCreateVectorValue(User, Part));
4033       }
4034 
4035       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4036       // but it is expected to be a vector otherwise.
4037       auto *NewGEP =
4038           GEP->isInBounds()
4039               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4040                                           Indices)
4041               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4042       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4043              "NewGEP is not a pointer vector");
4044       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4045       addMetadata(NewGEP, GEP);
4046     }
4047   }
4048 }
4049 
4050 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4051                                               unsigned VF) {
4052   PHINode *P = cast<PHINode>(PN);
4053   if (EnableVPlanNativePath) {
4054     // Currently we enter here in the VPlan-native path for non-induction
4055     // PHIs where all control flow is uniform. We simply widen these PHIs.
4056     // Create a vector phi with no operands - the vector phi operands will be
4057     // set at the end of vector code generation.
4058     Type *VecTy =
4059         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4060     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4061     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4062     OrigPHIsToFix.push_back(P);
4063 
4064     return;
4065   }
4066 
4067   assert(PN->getParent() == OrigLoop->getHeader() &&
4068          "Non-header phis should have been handled elsewhere");
4069 
4070   // In order to support recurrences we need to be able to vectorize Phi nodes.
4071   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4072   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4073   // this value when we vectorize all of the instructions that use the PHI.
4074   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4075     for (unsigned Part = 0; Part < UF; ++Part) {
4076       // This is phase one of vectorizing PHIs.
4077       Type *VecTy =
4078           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4079       Value *EntryPart = PHINode::Create(
4080           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4081       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4082     }
4083     return;
4084   }
4085 
4086   setDebugLocFromInst(Builder, P);
4087 
4088   // This PHINode must be an induction variable.
4089   // Make sure that we know about it.
4090   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4091 
4092   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4093   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4094 
4095   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4096   // which can be found from the original scalar operations.
4097   switch (II.getKind()) {
4098   case InductionDescriptor::IK_NoInduction:
4099     llvm_unreachable("Unknown induction");
4100   case InductionDescriptor::IK_IntInduction:
4101   case InductionDescriptor::IK_FpInduction:
4102     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4103   case InductionDescriptor::IK_PtrInduction: {
4104     // Handle the pointer induction variable case.
4105     assert(P->getType()->isPointerTy() && "Unexpected type.");
4106     // This is the normalized GEP that starts counting at zero.
4107     Value *PtrInd = Induction;
4108     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4109     // Determine the number of scalars we need to generate for each unroll
4110     // iteration. If the instruction is uniform, we only need to generate the
4111     // first lane. Otherwise, we generate all VF values.
4112     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4113     // These are the scalar results. Notice that we don't generate vector GEPs
4114     // because scalar GEPs result in better code.
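    // For example, with VF = 4 and UF = 1, a pointer induction that is uniform
    // after vectorization gets a single lane-0 'next.gep', whereas a
    // non-uniform one gets scalar GEPs for lanes 0..3.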
4115     for (unsigned Part = 0; Part < UF; ++Part) {
4116       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4117         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4118         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4119         Value *SclrGep =
4120             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4121         SclrGep->setName("next.gep");
4122         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4123       }
4124     }
4125     return;
4126   }
4127   }
4128 }
4129 
4130 /// A helper function for checking whether an integer division-related
4131 /// instruction may divide by zero (in which case it must be predicated if
4132 /// executed conditionally in the scalar code).
4133 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4134 /// Non-zero divisors that are not compile-time constants will not be
4135 /// converted into multiplication, so we will still end up scalarizing
4136 /// the division, but can do so w/o predication.
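/// For example, a 'udiv' whose divisor is the constant 7 returns false here
/// (no predication needed), while a 'udiv' by a loop-varying divisor returns
/// true.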
4137 static bool mayDivideByZero(Instruction &I) {
4138   assert((I.getOpcode() == Instruction::UDiv ||
4139           I.getOpcode() == Instruction::SDiv ||
4140           I.getOpcode() == Instruction::URem ||
4141           I.getOpcode() == Instruction::SRem) &&
4142          "Unexpected instruction");
4143   Value *Divisor = I.getOperand(1);
4144   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4145   return !CInt || CInt->isZero();
4146 }
4147 
4148 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4149   switch (I.getOpcode()) {
4150   case Instruction::Br:
4151   case Instruction::PHI:
4152   case Instruction::GetElementPtr:
4153     llvm_unreachable("This instruction is handled by a different recipe.");
4154   case Instruction::UDiv:
4155   case Instruction::SDiv:
4156   case Instruction::SRem:
4157   case Instruction::URem:
4158   case Instruction::Add:
4159   case Instruction::FAdd:
4160   case Instruction::Sub:
4161   case Instruction::FSub:
4162   case Instruction::FNeg:
4163   case Instruction::Mul:
4164   case Instruction::FMul:
4165   case Instruction::FDiv:
4166   case Instruction::FRem:
4167   case Instruction::Shl:
4168   case Instruction::LShr:
4169   case Instruction::AShr:
4170   case Instruction::And:
4171   case Instruction::Or:
4172   case Instruction::Xor: {
4173     // Just widen unops and binops.
4174     setDebugLocFromInst(Builder, &I);
4175 
4176     for (unsigned Part = 0; Part < UF; ++Part) {
4177       SmallVector<Value *, 2> Ops;
4178       for (Value *Op : I.operands())
4179         Ops.push_back(getOrCreateVectorValue(Op, Part));
4180 
4181       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4182 
4183       if (auto *VecOp = dyn_cast<Instruction>(V))
4184         VecOp->copyIRFlags(&I);
4185 
4186       // Use this vector value for all users of the original instruction.
4187       VectorLoopValueMap.setVectorValue(&I, Part, V);
4188       addMetadata(V, &I);
4189     }
4190 
4191     break;
4192   }
4193   case Instruction::Select: {
4194     // Widen selects.
4195     // If the selector is loop invariant we can create a select
4196     // instruction with a scalar condition. Otherwise, use vector-select.
4197     auto *SE = PSE.getSE();
4198     bool InvariantCond =
4199         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4200     setDebugLocFromInst(Builder, &I);
4201 
4202     // The condition can be loop invariant but still defined inside the
4203     // loop. This means that we can't just use the original 'cond' value.
4204     // We have to take the 'vectorized' value and pick the first lane.
4205     // InstCombine will make this a no-op.
4206 
4207     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4208 
4209     for (unsigned Part = 0; Part < UF; ++Part) {
4210       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4211       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4212       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4213       Value *Sel =
4214           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4215       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4216       addMetadata(Sel, &I);
4217     }
4218 
4219     break;
4220   }
4221 
4222   case Instruction::ICmp:
4223   case Instruction::FCmp: {
4224     // Widen compares. Generate vector compares.
4225     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4226     auto *Cmp = cast<CmpInst>(&I);
4227     setDebugLocFromInst(Builder, Cmp);
4228     for (unsigned Part = 0; Part < UF; ++Part) {
4229       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4230       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4231       Value *C = nullptr;
4232       if (FCmp) {
4233         // Propagate fast math flags.
4234         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4235         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4236         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4237       } else {
4238         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4239       }
4240       VectorLoopValueMap.setVectorValue(&I, Part, C);
4241       addMetadata(C, &I);
4242     }
4243 
4244     break;
4245   }
4246 
4247   case Instruction::ZExt:
4248   case Instruction::SExt:
4249   case Instruction::FPToUI:
4250   case Instruction::FPToSI:
4251   case Instruction::FPExt:
4252   case Instruction::PtrToInt:
4253   case Instruction::IntToPtr:
4254   case Instruction::SIToFP:
4255   case Instruction::UIToFP:
4256   case Instruction::Trunc:
4257   case Instruction::FPTrunc:
4258   case Instruction::BitCast: {
4259     auto *CI = cast<CastInst>(&I);
4260     setDebugLocFromInst(Builder, CI);
4261 
4262     /// Vectorize casts.
4263     Type *DestTy =
4264         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4265 
4266     for (unsigned Part = 0; Part < UF; ++Part) {
4267       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4268       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4269       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4270       addMetadata(Cast, &I);
4271     }
4272     break;
4273   }
4274 
4275   case Instruction::Call: {
4276     // Ignore dbg intrinsics.
4277     if (isa<DbgInfoIntrinsic>(I))
4278       break;
4279     setDebugLocFromInst(Builder, &I);
4280 
4281     Module *M = I.getParent()->getParent()->getParent();
4282     auto *CI = cast<CallInst>(&I);
4283 
4284     StringRef FnName = CI->getCalledFunction()->getName();
4285     Function *F = CI->getCalledFunction();
4286     Type *RetTy = ToVectorTy(CI->getType(), VF);
4287     SmallVector<Type *, 4> Tys;
4288     for (Value *ArgOperand : CI->arg_operands())
4289       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4290 
4291     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4292 
4293     // The flag indicates whether we widen the call as a vector intrinsic or
4294     // as a regular (library) call for the vectorized version of the
4295     // instruction, i.e., whether the intrinsic call is cheaper than the lib call.
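    // For example, a call that maps to a vector intrinsic may still be emitted
    // as a call to a vector library routine when the cost model finds the
    // latter cheaper.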
4296     bool NeedToScalarize;
4297     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4298     bool UseVectorIntrinsic =
4299         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4300     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4301            "Instruction should be scalarized elsewhere.");
4302 
4303     for (unsigned Part = 0; Part < UF; ++Part) {
4304       SmallVector<Value *, 4> Args;
4305       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4306         Value *Arg = CI->getArgOperand(i);
4307         // Some intrinsics have a scalar argument - don't replace it with a
4308         // vector.
4309         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4310           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4311         Args.push_back(Arg);
4312       }
4313 
4314       Function *VectorF;
4315       if (UseVectorIntrinsic) {
4316         // Use vector version of the intrinsic.
4317         Type *TysForDecl[] = {CI->getType()};
4318         if (VF > 1)
4319           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4320         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4321       } else {
4322         // Use vector version of the library call.
4323         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4324         assert(!VFnName.empty() && "Vector function name is empty.");
4325         VectorF = M->getFunction(VFnName);
4326         if (!VectorF) {
4327           // Generate a declaration
4328           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4329           VectorF =
4330               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4331           VectorF->copyAttributesFrom(F);
4332         }
4333       }
4334       assert(VectorF && "Can't create vector function.");
4335 
4336       SmallVector<OperandBundleDef, 1> OpBundles;
4337       CI->getOperandBundlesAsDefs(OpBundles);
4338       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4339 
4340       if (isa<FPMathOperator>(V))
4341         V->copyFastMathFlags(CI);
4342 
4343       VectorLoopValueMap.setVectorValue(&I, Part, V);
4344       addMetadata(V, &I);
4345     }
4346 
4347     break;
4348   }
4349 
4350   default:
4351     // This instruction is not vectorized by simple widening.
4352     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4353     llvm_unreachable("Unhandled instruction!");
4354   } // end of switch.
4355 }
4356 
4357 void InnerLoopVectorizer::updateAnalysis() {
4358   // Forget the original basic block.
4359   PSE.getSE()->forgetLoop(OrigLoop);
4360 
4361   // DT is not kept up-to-date for outer loop vectorization
4362   if (EnableVPlanNativePath)
4363     return;
4364 
4365   // Update the dominator tree information.
4366   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4367          "Entry does not dominate exit.");
4368 
4369   DT->addNewBlock(LoopMiddleBlock,
4370                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4371   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4372   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4373   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4374   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4375 }
4376 
4377 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4378   // We should not collect Scalars more than once per VF. Right now, this
4379   // function is called from collectUniformsAndScalars(), which already does
4380   // this check. Collecting Scalars for VF=1 does not make any sense.
4381   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4382          "This function should not be visited twice for the same VF");
4383 
4384   SmallSetVector<Instruction *, 8> Worklist;
4385 
4386   // These sets are used to seed the analysis with pointers used by memory
4387   // accesses that will remain scalar.
4388   SmallSetVector<Instruction *, 8> ScalarPtrs;
4389   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4390 
4391   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4392   // The pointer operands of loads and stores will be scalar as long as the
4393   // memory access is not a gather or scatter operation. The value operand of a
4394   // store will remain scalar if the store is scalarized.
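  // For example, the pointer of a consecutively widened load is a scalar use,
  // whereas the pointer of a gather/scatter is not; a store's value operand is
  // a scalar use only if the store itself is scalarized.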
4395   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4396     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4397     assert(WideningDecision != CM_Unknown &&
4398            "Widening decision should be ready at this moment");
4399     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4400       if (Ptr == Store->getValueOperand())
4401         return WideningDecision == CM_Scalarize;
4402     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4403            "Ptr is neither a value or pointer operand");
4404     return WideningDecision != CM_GatherScatter;
4405   };
4406 
4407   // A helper that returns true if the given value is a bitcast or
4408   // getelementptr instruction contained in the loop.
4409   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4410     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4411             isa<GetElementPtrInst>(V)) &&
4412            !TheLoop->isLoopInvariant(V);
4413   };
4414 
4415   // A helper that evaluates a memory access's use of a pointer. If the use
4416   // will be a scalar use, and the pointer is only used by memory accesses, we
4417   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4418   // PossibleNonScalarPtrs.
4419   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4420     // We only care about bitcast and getelementptr instructions contained in
4421     // the loop.
4422     if (!isLoopVaryingBitCastOrGEP(Ptr))
4423       return;
4424 
4425     // If the pointer has already been identified as scalar (e.g., if it was
4426     // also identified as uniform), there's nothing to do.
4427     auto *I = cast<Instruction>(Ptr);
4428     if (Worklist.count(I))
4429       return;
4430 
4431     // If the use of the pointer will be a scalar use, and all users of the
4432     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4433     // place the pointer in PossibleNonScalarPtrs.
4434     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4435           return isa<LoadInst>(U) || isa<StoreInst>(U);
4436         }))
4437       ScalarPtrs.insert(I);
4438     else
4439       PossibleNonScalarPtrs.insert(I);
4440   };
4441 
4442   // We seed the scalars analysis with three classes of instructions: (1)
4443   // instructions marked uniform-after-vectorization, (2) bitcast and
4444   // getelementptr instructions used by memory accesses requiring a scalar use,
4445   // and (3) pointer induction variables and their update instructions (we
4446   // currently only scalarize these).
4447   //
4448   // (1) Add to the worklist all instructions that have been identified as
4449   // uniform-after-vectorization.
4450   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4451 
4452   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4453   // memory accesses requiring a scalar use. The pointer operands of loads and
4454   // stores will be scalar as long as the memory access is not a gather or
4455   // scatter operation. The value operand of a store will remain scalar if the
4456   // store is scalarized.
4457   for (auto *BB : TheLoop->blocks())
4458     for (auto &I : *BB) {
4459       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4460         evaluatePtrUse(Load, Load->getPointerOperand());
4461       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4462         evaluatePtrUse(Store, Store->getPointerOperand());
4463         evaluatePtrUse(Store, Store->getValueOperand());
4464       }
4465     }
4466   for (auto *I : ScalarPtrs)
4467     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4468       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4469       Worklist.insert(I);
4470     }
4471 
4472   // (3) Add to the worklist all pointer induction variables and their update
4473   // instructions.
4474   //
4475   // TODO: Once we are able to vectorize pointer induction variables we should
4476   //       no longer insert them into the worklist here.
4477   auto *Latch = TheLoop->getLoopLatch();
4478   for (auto &Induction : *Legal->getInductionVars()) {
4479     auto *Ind = Induction.first;
4480     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4481     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4482       continue;
4483     Worklist.insert(Ind);
4484     Worklist.insert(IndUpdate);
4485     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4486     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4487                       << "\n");
4488   }
4489 
4490   // Insert the forced scalars.
4491   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4492   // induction variable when the PHI user is scalarized.
4493   auto ForcedScalar = ForcedScalars.find(VF);
4494   if (ForcedScalar != ForcedScalars.end())
4495     for (auto *I : ForcedScalar->second)
4496       Worklist.insert(I);
4497 
4498   // Expand the worklist by looking through any bitcasts and getelementptr
4499   // instructions we've already identified as scalar. This is similar to the
4500   // expansion step in collectLoopUniforms(); however, here we're only
4501   // expanding to include additional bitcasts and getelementptr instructions.
4502   unsigned Idx = 0;
4503   while (Idx != Worklist.size()) {
4504     Instruction *Dst = Worklist[Idx++];
4505     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4506       continue;
4507     auto *Src = cast<Instruction>(Dst->getOperand(0));
4508     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4509           auto *J = cast<Instruction>(U);
4510           return !TheLoop->contains(J) || Worklist.count(J) ||
4511                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4512                   isScalarUse(J, Src));
4513         })) {
4514       Worklist.insert(Src);
4515       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4516     }
4517   }
4518 
4519   // An induction variable will remain scalar if all users of the induction
4520   // variable and induction variable update remain scalar.
4521   for (auto &Induction : *Legal->getInductionVars()) {
4522     auto *Ind = Induction.first;
4523     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4524 
4525     // We already considered pointer induction variables, so there's no reason
4526     // to look at their users again.
4527     //
4528     // TODO: Once we are able to vectorize pointer induction variables we
4529     //       should no longer skip over them here.
4530     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4531       continue;
4532 
4533     // Determine if all users of the induction variable are scalar after
4534     // vectorization.
4535     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4536       auto *I = cast<Instruction>(U);
4537       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4538     });
4539     if (!ScalarInd)
4540       continue;
4541 
4542     // Determine if all users of the induction variable update instruction are
4543     // scalar after vectorization.
4544     auto ScalarIndUpdate =
4545         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4546           auto *I = cast<Instruction>(U);
4547           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4548         });
4549     if (!ScalarIndUpdate)
4550       continue;
4551 
4552     // The induction variable and its update instruction will remain scalar.
4553     Worklist.insert(Ind);
4554     Worklist.insert(IndUpdate);
4555     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4556     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4557                       << "\n");
4558   }
4559 
4560   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4561 }
4562 
4563 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4564   if (!blockNeedsPredication(I->getParent()))
4565     return false;
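  // For example, a conditional load/store that the target cannot execute as a
  // masked or gather/scatter access must be scalarized and predicated, as must
  // a conditional division whose divisor may be zero.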
4566   switch(I->getOpcode()) {
4567   default:
4568     break;
4569   case Instruction::Load:
4570   case Instruction::Store: {
4571     if (!Legal->isMaskRequired(I))
4572       return false;
4573     auto *Ptr = getLoadStorePointerOperand(I);
4574     auto *Ty = getMemInstValueType(I);
4575     // We have already decided how to vectorize this instruction, get that
4576     // result.
4577     if (VF > 1) {
4578       InstWidening WideningDecision = getWideningDecision(I, VF);
4579       assert(WideningDecision != CM_Unknown &&
4580              "Widening decision should be ready at this moment");
4581       return WideningDecision == CM_Scalarize;
4582     }
4583     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4584     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4585                                 isLegalMaskedGather(Ty, Alignment))
4586                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4587                                 isLegalMaskedScatter(Ty, Alignment));
4588   }
4589   case Instruction::UDiv:
4590   case Instruction::SDiv:
4591   case Instruction::SRem:
4592   case Instruction::URem:
4593     return mayDivideByZero(*I);
4594   }
4595   return false;
4596 }
4597 
4598 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4599                                                                unsigned VF) {
4600   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4601   assert(getWideningDecision(I, VF) == CM_Unknown &&
4602          "Decision should not be set yet.");
4603   auto *Group = getInterleavedAccessGroup(I);
4604   assert(Group && "Must have a group.");
4605 
4606   // If the instruction's allocated size doesn't equal its type size, it
4607   // requires padding and will be scalarized.
4608   auto &DL = I->getModule()->getDataLayout();
4609   auto *ScalarTy = getMemInstValueType(I);
4610   if (hasIrregularType(ScalarTy, DL, VF))
4611     return false;
4612 
4613   // Check if masking is required.
4614   // A Group may need masking for one of two reasons: it resides in a block that
4615   // needs predication, or it was decided to use masking to deal with gaps.
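  // For example, a group with a gap at its end normally relies on a scalar
  // epilogue to avoid accessing memory past the original scalar accesses; if
  // that epilogue is not allowed, the group must be masked instead.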
4616   bool PredicatedAccessRequiresMasking =
4617       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4618   bool AccessWithGapsRequiresMasking =
4619       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4620   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4621     return true;
4622 
4623   // If masked interleaving is required, we expect that the user/target had
4624   // enabled it, because otherwise it either wouldn't have been created or
4625   // it should have been invalidated by the CostModel.
4626   assert(useMaskedInterleavedAccesses(TTI) &&
4627          "Masked interleave-groups for predicated accesses are not enabled.");
4628 
4629   auto *Ty = getMemInstValueType(I);
4630   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4631   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4632                           : TTI.isLegalMaskedStore(Ty, Alignment);
4633 }
4634 
4635 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4636                                                                unsigned VF) {
4637   // Get and ensure we have a valid memory instruction.
4638   LoadInst *LI = dyn_cast<LoadInst>(I);
4639   StoreInst *SI = dyn_cast<StoreInst>(I);
4640   assert((LI || SI) && "Invalid memory instruction");
4641 
4642   auto *Ptr = getLoadStorePointerOperand(I);
4643 
4644   // In order to be widened, the pointer should be consecutive, first of all.
4645   if (!Legal->isConsecutivePtr(Ptr))
4646     return false;
4647 
4648   // If the instruction is a store located in a predicated block, it will be
4649   // scalarized.
4650   if (isScalarWithPredication(I))
4651     return false;
4652 
4653   // If the instruction's allocated size doesn't equal its type size, it
4654   // requires padding and will be scalarized.
4655   auto &DL = I->getModule()->getDataLayout();
4656   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4657   if (hasIrregularType(ScalarTy, DL, VF))
4658     return false;
4659 
4660   return true;
4661 }
4662 
4663 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4664   // We should not collect Uniforms more than once per VF. Right now,
4665   // this function is called from collectUniformsAndScalars(), which
4666   // already does this check. Collecting Uniforms for VF=1 does not make any
4667   // sense.
4668 
4669   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4670          "This function should not be visited twice for the same VF");
4671 
4672   // Visit the list of Uniforms. Even if we find no uniform values, we will
4673   // not analyze this VF again: Uniforms.count(VF) will return 1.
4674   Uniforms[VF].clear();
4675 
4676   // We now know that the loop is vectorizable!
4677   // Collect instructions inside the loop that will remain uniform after
4678   // vectorization.
4679 
4680   // Global values, params and instructions outside of current loop are out of
4681   // scope.
4682   auto isOutOfScope = [&](Value *V) -> bool {
4683     Instruction *I = dyn_cast<Instruction>(V);
4684     return (!I || !TheLoop->contains(I));
4685   };
4686 
4687   SetVector<Instruction *> Worklist;
4688   BasicBlock *Latch = TheLoop->getLoopLatch();
4689 
4690   // Instructions that are scalar with predication must not be considered
4691   // uniform after vectorization, because that would create an erroneous
4692   // replicating region where only a single instance out of VF should be formed.
4693   // TODO: optimize such seldom cases if found important, see PR40816.
4694   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4695     if (isScalarWithPredication(I, VF)) {
4696       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4697                         << *I << "\n");
4698       return;
4699     }
4700     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4701     Worklist.insert(I);
4702   };
4703 
4704   // Start with the conditional branch. If the branch condition is an
4705   // instruction contained in the loop that is only used by the branch, it is
4706   // uniform.
4707   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4708   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4709     addToWorklistIfAllowed(Cmp);
4710 
4711   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4712   // are pointers that are treated like consecutive pointers during
4713   // vectorization. The pointer operands of interleaved accesses are an
4714   // example.
4715   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4716 
4717   // Holds pointer operands of instructions that are possibly non-uniform.
4718   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4719 
4720   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4721     InstWidening WideningDecision = getWideningDecision(I, VF);
4722     assert(WideningDecision != CM_Unknown &&
4723            "Widening decision should be ready at this moment");
4724 
4725     return (WideningDecision == CM_Widen ||
4726             WideningDecision == CM_Widen_Reverse ||
4727             WideningDecision == CM_Interleave);
4728   };
4729   // Iterate over the instructions in the loop, and collect all
4730   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4731   // that a consecutive-like pointer operand will be scalarized, we collect it
4732   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4733   // getelementptr instruction can be used by both vectorized and scalarized
4734   // memory instructions. For example, if a loop loads and stores from the same
4735   // location, but the store is conditional, the store will be scalarized, and
4736   // the getelementptr won't remain uniform.
4737   for (auto *BB : TheLoop->blocks())
4738     for (auto &I : *BB) {
4739       // If there's no pointer operand, there's nothing to do.
4740       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4741       if (!Ptr)
4742         continue;
4743 
4744       // True if all users of Ptr are memory accesses that have Ptr as their
4745       // pointer operand.
4746       auto UsersAreMemAccesses =
4747           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4748             return getLoadStorePointerOperand(U) == Ptr;
4749           });
4750 
4751       // Ensure the memory instruction will not be scalarized or used by
4752       // gather/scatter, making its pointer operand non-uniform. If the pointer
4753       // operand is used by any instruction other than a memory access, we
4754       // conservatively assume the pointer operand may be non-uniform.
4755       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4756         PossibleNonUniformPtrs.insert(Ptr);
4757 
4758       // If the memory instruction will be vectorized and its pointer operand
4759       // is consecutive-like, or interleaving - the pointer operand should
4760       // remain uniform.
4761       else
4762         ConsecutiveLikePtrs.insert(Ptr);
4763     }
4764 
4765   // Add to the Worklist all consecutive and consecutive-like pointers that
4766   // aren't also identified as possibly non-uniform.
4767   for (auto *V : ConsecutiveLikePtrs)
4768     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4769       addToWorklistIfAllowed(V);
4770 
4771   // Expand Worklist in topological order: whenever a new instruction is
4772   // added, its users should already be inside Worklist. This ensures
4773   // a uniform instruction will only be used by uniform instructions.
4774   unsigned idx = 0;
4775   while (idx != Worklist.size()) {
4776     Instruction *I = Worklist[idx++];
4777 
4778     for (auto OV : I->operand_values()) {
4779       // isOutOfScope operands cannot be uniform instructions.
4780       if (isOutOfScope(OV))
4781         continue;
4782       // First order recurrence Phi's should typically be considered
4783       // non-uniform.
4784       auto *OP = dyn_cast<PHINode>(OV);
4785       if (OP && Legal->isFirstOrderRecurrence(OP))
4786         continue;
4787       // If all the users of the operand are uniform, then add the
4788       // operand into the uniform worklist.
4789       auto *OI = cast<Instruction>(OV);
4790       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4791             auto *J = cast<Instruction>(U);
4792             return Worklist.count(J) ||
4793                    (OI == getLoadStorePointerOperand(J) &&
4794                     isUniformDecision(J, VF));
4795           }))
4796         addToWorklistIfAllowed(OI);
4797     }
4798   }
4799 
4800   // Returns true if Ptr is the pointer operand of a memory access instruction
4801   // I, and I is known to not require scalarization.
4802   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4803     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4804   };
4805 
4806   // For an instruction to be added into Worklist above, all its users inside
4807   // the loop should also be in Worklist. However, this condition cannot be
4808   // true for phi nodes that form a cyclic dependence. We must process phi
4809   // nodes separately. An induction variable will remain uniform if all users
4810   // of the induction variable and induction variable update remain uniform.
4811   // The code below handles both pointer and non-pointer induction variables.
4812   for (auto &Induction : *Legal->getInductionVars()) {
4813     auto *Ind = Induction.first;
4814     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4815 
4816     // Determine if all users of the induction variable are uniform after
4817     // vectorization.
4818     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4819       auto *I = cast<Instruction>(U);
4820       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4821              isVectorizedMemAccessUse(I, Ind);
4822     });
4823     if (!UniformInd)
4824       continue;
4825 
4826     // Determine if all users of the induction variable update instruction are
4827     // uniform after vectorization.
4828     auto UniformIndUpdate =
4829         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4830           auto *I = cast<Instruction>(U);
4831           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4832                  isVectorizedMemAccessUse(I, IndUpdate);
4833         });
4834     if (!UniformIndUpdate)
4835       continue;
4836 
4837     // The induction variable and its update instruction will remain uniform.
4838     addToWorklistIfAllowed(Ind);
4839     addToWorklistIfAllowed(IndUpdate);
4840   }
4841 
4842   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4843 }
4844 
4845 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4846   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4847 
4848   if (Legal->getRuntimePointerChecking()->Need) {
4849     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4850         "runtime pointer checks needed. Enable vectorization of this "
4851         "loop with '#pragma clang loop vectorize(enable)' when "
4852         "compiling with -Os/-Oz",
4853         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4854     return true;
4855   }
4856 
4857   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4858     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4859         "runtime SCEV checks needed. Enable vectorization of this "
4860         "loop with '#pragma clang loop vectorize(enable)' when "
4861         "compiling with -Os/-Oz",
4862         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4863     return true;
4864   }
4865 
4866   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4867   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4868     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4869         "runtime stride == 1 checks needed. Enable vectorization of "
4870         "this loop with '#pragma clang loop vectorize(enable)' when "
4871         "compiling with -Os/-Oz",
4872         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4873     return true;
4874   }
4875 
4876   return false;
4877 }
4878 
4879 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4880   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since it's still likely to be
    // dynamically uniform if the target can skip it.
4883     reportVectorizationFailure(
4884         "Not inserting runtime ptr check for divergent target",
4885         "runtime pointer checks needed. Not enabled for divergent target",
4886         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4887     return None;
4888   }
4889 
4890   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4891   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4892   if (TC == 1) {
4893     reportVectorizationFailure("Single iteration (non) loop",
4894         "loop trip count is one, irrelevant for vectorization",
4895         "SingleIterationLoop", ORE, TheLoop);
4896     return None;
4897   }
4898 
4899   switch (ScalarEpilogueStatus) {
4900   case CM_ScalarEpilogueAllowed:
4901     return computeFeasibleMaxVF(TC);
4902   case CM_ScalarEpilogueNotNeededUsePredicate:
4903     LLVM_DEBUG(
4904         dbgs() << "LV: vector predicate hint/switch found.\n"
4905                << "LV: Not allowing scalar epilogue, creating predicated "
4906                << "vector loop.\n");
4907     break;
4908   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4909     // fallthrough as a special case of OptForSize
4910   case CM_ScalarEpilogueNotAllowedOptSize:
4911     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4912       LLVM_DEBUG(
4913           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4914     else
4915       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4916                         << "count.\n");
4917 
    // Bail out if runtime checks are required; they are undesirable when
    // optimizing for size.
4920     if (runtimeChecksRequired())
4921       return None;
4922     break;
4923   }
4924 
  // Now try to fold the tail by masking.
4926 
4927   // Invalidate interleave groups that require an epilogue if we can't mask
4928   // the interleave-group.
4929   if (!useMaskedInterleavedAccesses(TTI))
4930     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4931 
4932   unsigned MaxVF = computeFeasibleMaxVF(TC);
4933   if (TC > 0 && TC % MaxVF == 0) {
4934     // Accept MaxVF if we do not have a tail.
4935     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4936     return MaxVF;
4937   }
4938 
4939   // If we don't know the precise trip count, or if the trip count that we
4940   // found modulo the vectorization factor is not zero, try to fold the tail
4941   // by masking.
4942   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4943   if (Legal->prepareToFoldTailByMasking()) {
4944     FoldTailByMasking = true;
4945     return MaxVF;
4946   }
4947 
4948   if (TC == 0) {
4949     reportVectorizationFailure(
4950         "Unable to calculate the loop count due to complex control flow",
4951         "unable to calculate the loop count due to complex control flow",
4952         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4953     return None;
4954   }
4955 
4956   reportVectorizationFailure(
4957       "Cannot optimize for size and vectorize at the same time.",
4958       "cannot optimize for size and vectorize at the same time. "
4959       "Enable vectorization of this loop with '#pragma clang loop "
4960       "vectorize(enable)' when compiling with -Os/-Oz",
4961       "NoTailLoopWithOptForSize", ORE, TheLoop);
4962   return None;
4963 }
4964 
4965 unsigned
4966 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4967   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4968   unsigned SmallestType, WidestType;
4969   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4970   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4971 
4972   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from the
  // memory access that is most restrictive (involved in the smallest
  // dependence distance).
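  // For example (illustrative numbers): if the most restrictive dependence
  // allows a MaxVF of 4 for i32 accesses, the limit is 4 * 4 * 8 = 128 bits,
  // which caps the usable register width below.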
4976   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4977 
4978   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4979 
4980   unsigned MaxVectorSize = WidestRegister / WidestType;
4981 
4982   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4983                     << " / " << WidestType << " bits.\n");
4984   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4985                     << WidestRegister << " bits.\n");
4986 
4987   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4988                                  " into one vector!");
4989   if (MaxVectorSize == 0) {
4990     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4991     MaxVectorSize = 1;
4992     return MaxVectorSize;
4993   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4994              isPowerOf2_32(ConstTripCount)) {
4995     // We need to clamp the VF to be the ConstTripCount. There is no point in
4996     // choosing a higher viable VF as done in the loop below.
4997     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4998                       << ConstTripCount << "\n");
4999     MaxVectorSize = ConstTripCount;
5000     return MaxVectorSize;
5001   }
5002 
5003   unsigned MaxVF = MaxVectorSize;
5004   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5005       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5006     // Collect all viable vectorization factors larger than the default MaxVF
5007     // (i.e. MaxVectorSize).
5008     SmallVector<unsigned, 8> VFs;
5009     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5010     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5011       VFs.push_back(VS);
5012 
5013     // For each VF calculate its register usage.
5014     auto RUs = calculateRegisterUsage(VFs);
5015 
5016     // Select the largest VF which doesn't require more registers than existing
5017     // ones.
5018     for (int i = RUs.size() - 1; i >= 0; --i) {
5019       bool Selected = true;
5020       for (auto& pair : RUs[i].MaxLocalUsers) {
5021         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5022         if (pair.second > TargetNumRegisters)
5023           Selected = false;
5024       }
5025       if (Selected) {
5026         MaxVF = VFs[i];
5027         break;
5028       }
5029     }
5030     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5031       if (MaxVF < MinVF) {
5032         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5033                           << ") with target's minimum: " << MinVF << '\n');
5034         MaxVF = MinVF;
5035       }
5036     }
5037   }
5038   return MaxVF;
5039 }
5040 
5041 VectorizationFactor
5042 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5043   float Cost = expectedCost(1).first;
5044   const float ScalarCost = Cost;
5045   unsigned Width = 1;
5046   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5047 
5048   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5049   if (ForceVectorization && MaxVF > 1) {
5050     // Ignore scalar width, because the user explicitly wants vectorization.
5051     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5052     // evaluation.
5053     Cost = std::numeric_limits<float>::max();
5054   }
5055 
5056   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so we
    // divide the cost of the vector loop by the vectorization width.
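    // For example, if the expected cost at VF = 4 is 20, the normalized
    // per-iteration cost compared against the scalar loop is 20 / 4 = 5.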
5060     VectorizationCostTy C = expectedCost(i);
5061     float VectorCost = C.first / (float)i;
5062     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5063                       << " costs: " << (int)VectorCost << ".\n");
5064     if (!C.second && !ForceVectorization) {
5065       LLVM_DEBUG(
5066           dbgs() << "LV: Not considering vector loop of width " << i
5067                  << " because it will not generate any vector instructions.\n");
5068       continue;
5069     }
5070     if (VectorCost < Cost) {
5071       Cost = VectorCost;
5072       Width = i;
5073     }
5074   }
5075 
5076   if (!EnableCondStoresVectorization && NumPredStores) {
5077     reportVectorizationFailure("There are conditional stores.",
5078         "store that is conditionally executed prevents vectorization",
5079         "ConditionalStore", ORE, TheLoop);
5080     Width = 1;
5081     Cost = ScalarCost;
5082   }
5083 
5084   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5085              << "LV: Vectorization seems to be not beneficial, "
5086              << "but was forced by a user.\n");
5087   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5088   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5089   return Factor;
5090 }
5091 
5092 std::pair<unsigned, unsigned>
5093 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5094   unsigned MinWidth = -1U;
5095   unsigned MaxWidth = 8;
5096   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5097 
5098   // For each block.
5099   for (BasicBlock *BB : TheLoop->blocks()) {
5100     // For each instruction in the loop.
5101     for (Instruction &I : BB->instructionsWithoutDebug()) {
5102       Type *T = I.getType();
5103 
5104       // Skip ignored values.
5105       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5106         continue;
5107 
5108       // Only examine Loads, Stores and PHINodes.
5109       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5110         continue;
5111 
5112       // Examine PHI nodes that are reduction variables. Update the type to
5113       // account for the recurrence type.
5114       if (auto *PN = dyn_cast<PHINode>(&I)) {
5115         if (!Legal->isReductionVariable(PN))
5116           continue;
5117         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5118         T = RdxDesc.getRecurrenceType();
5119       }
5120 
5121       // Examine the stored values.
5122       if (auto *ST = dyn_cast<StoreInst>(&I))
5123         T = ST->getValueOperand()->getType();
5124 
5125       // Ignore loaded pointer types and stored pointer types that are not
5126       // vectorizable.
5127       //
5128       // FIXME: The check here attempts to predict whether a load or store will
5129       //        be vectorized. We only know this for certain after a VF has
5130       //        been selected. Here, we assume that if an access can be
5131       //        vectorized, it will be. We should also look at extending this
5132       //        optimization to non-pointer types.
5133       //
5134       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5135           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5136         continue;
5137 
5138       MinWidth = std::min(MinWidth,
5139                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5140       MaxWidth = std::max(MaxWidth,
5141                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5142     }
5143   }
5144 
5145   return {MinWidth, MaxWidth};
5146 }
5147 
5148 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5149                                                            unsigned LoopCost) {
5150   // -- The interleave heuristics --
5151   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5152   // There are many micro-architectural considerations that we can't predict
5153   // at this level. For example, frontend pressure (on decode or fetch) due to
5154   // code size, or the number and capabilities of the execution ports.
5155   //
5156   // We use the following heuristics to select the interleave count:
5157   // 1. If the code has reductions, then we interleave to break the cross
5158   // iteration dependency.
5159   // 2. If the loop is really small, then we interleave to reduce the loop
5160   // overhead.
5161   // 3. We don't interleave if we think that we will spill registers to memory
5162   // due to the increased register pressure.
5163 
5164   if (!isScalarEpilogueAllowed())
5165     return 1;
5166 
  // We already used the maximum safe dependence distance to limit the
  // vectorization width; do not interleave in that case.
5168   if (Legal->getMaxSafeDepDistBytes() != -1U)
5169     return 1;
5170 
5171   // Do not interleave loops with a relatively small known or estimated trip
5172   // count.
5173   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5174   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5175     return 1;
5176 
5177   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so make sure each register class
  // records at least one register in use.
5180   for (auto& pair : R.MaxLocalUsers) {
5181     pair.second = std::max(pair.second, 1U);
5182   }
5183 
5184   // We calculate the interleave count using the following formula.
5185   // Subtract the number of loop invariants from the number of available
5186   // registers. These registers are used by all of the interleaved instances.
5187   // Next, divide the remaining registers by the number of registers that is
5188   // required by the loop, in order to estimate how many parallel instances
5189   // fit without causing spills. All of this is rounded down if necessary to be
5190   // a power of two. We want power of two interleave count to simplify any
5191   // addressing operations or alignment considerations.
5192   // We also want power of two interleave counts to ensure that the induction
5193   // variable of the vector loop wraps to zero, when tail is folded by masking;
5194   // this currently happens when OptForSize, in which case IC is set to 1 above.
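  // As a rough illustration of the formula below: with 32 registers in a
  // class, 2 of them holding loop-invariant values and at most 6 values of
  // that class live at once, the estimate is
  // PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4.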
5195   unsigned IC = UINT_MAX;
5196 
5197   for (auto& pair : R.MaxLocalUsers) {
5198     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5199     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5200                       << " registers of "
5201                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5202     if (VF == 1) {
5203       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5204         TargetNumRegisters = ForceTargetNumScalarRegs;
5205     } else {
5206       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5207         TargetNumRegisters = ForceTargetNumVectorRegs;
5208     }
5209     unsigned MaxLocalUsers = pair.second;
5210     unsigned LoopInvariantRegs = 0;
5211     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5212       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5213 
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
5215     // Don't count the induction variable as interleaved.
5216     if (EnableIndVarRegisterHeur) {
5217       TmpIC =
5218           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5219                         std::max(1U, (MaxLocalUsers - 1)));
5220     }
5221 
5222     IC = std::min(IC, TmpIC);
5223   }
5224 
5225   // Clamp the interleave ranges to reasonable counts.
5226   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5227 
5228   // Check if the user has overridden the max.
5229   if (VF == 1) {
5230     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5231       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5232   } else {
5233     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5234       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5235   }
5236 
5237   // If trip count is known or estimated compile time constant, limit the
5238   // interleave count to be less than the trip count divided by VF.
5239   if (BestKnownTC) {
5240     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5241   }
5242 
5243   // If we did not calculate the cost for VF (because the user selected the VF)
5244   // then we calculate the cost of VF here.
5245   if (LoopCost == 0)
5246     LoopCost = expectedCost(VF).first;
5247 
5248   assert(LoopCost && "Non-zero loop cost expected");
5249 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5252   if (IC > MaxInterleaveCount)
5253     IC = MaxInterleaveCount;
5254   else if (IC < 1)
5255     IC = 1;
5256 
5257   // Interleave if we vectorized this loop and there is a reduction that could
5258   // benefit from interleaving.
5259   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5260     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5261     return IC;
5262   }
5263 
5264   // Note that if we've already vectorized the loop we will have done the
5265   // runtime check and so interleaving won't require further checks.
5266   bool InterleavingRequiresRuntimePointerCheck =
5267       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5268 
5269   // We want to interleave small loops in order to reduce the loop overhead and
5270   // potentially expose ILP opportunities.
5271   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5272   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5273     // We assume that the cost overhead is 1 and we use the cost model
5274     // to estimate the cost of the loop and interleave until the cost of the
5275     // loop overhead is about 5% of the cost of the loop.
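    // For example (illustrative values), if LoopCost is 6 and SmallLoopCost
    // is 20, SmallIC is capped at PowerOf2Floor(20 / 6) = 2.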
5276     unsigned SmallIC =
5277         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5278 
5279     // Interleave until store/load ports (estimated by max interleave count) are
5280     // saturated.
5281     unsigned NumStores = Legal->getNumStores();
5282     unsigned NumLoads = Legal->getNumLoads();
5283     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5284     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5285 
5286     // If we have a scalar reduction (vector reductions are already dealt with
5287     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2 so
    // that the critical path only gets increased by one reduction operation.
5290     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5291       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5292       SmallIC = std::min(SmallIC, F);
5293       StoresIC = std::min(StoresIC, F);
5294       LoadsIC = std::min(LoadsIC, F);
5295     }
5296 
5297     if (EnableLoadStoreRuntimeInterleave &&
5298         std::max(StoresIC, LoadsIC) > SmallIC) {
5299       LLVM_DEBUG(
5300           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5301       return std::max(StoresIC, LoadsIC);
5302     }
5303 
5304     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5305     return SmallIC;
5306   }
5307 
5308   // Interleave if this is a large loop (small loops are already dealt with by
5309   // this point) that could benefit from interleaving.
5310   bool HasReductions = !Legal->getReductionVars()->empty();
5311   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5312     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5313     return IC;
5314   }
5315 
5316   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5317   return 1;
5318 }
5319 
5320 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5321 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5322   // This function calculates the register usage by measuring the highest number
5323   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5325   // assign a number to each instruction. We use RPO to ensure that defs are
5326   // met before their users. We assume that each instruction that has in-loop
5327   // users starts an interval. We record every time that an in-loop value is
5328   // used, so we have a list of the first and last occurrences of each
5329   // instruction. Next, we transpose this data structure into a multi map that
5330   // holds the list of intervals that *end* at a specific location. This multi
5331   // map allows us to perform a linear search. We scan the instructions linearly
5332   // and record each time that a new interval starts, by placing it in a set.
5333   // If we find this value in the multi-map then we remove it from the set.
5334   // The max register usage is the maximum size of the set.
5335   // We also search for instructions that are defined outside the loop, but are
5336   // used inside the loop. We need this number separately from the max-interval
5337   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
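  // For instance, a value defined by a load and last used three instructions
  // later is considered live at every point in between and contributes to
  // the register count at each of those points.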
5339   LoopBlocksDFS DFS(TheLoop);
5340   DFS.perform(LI);
5341 
5342   RegisterUsage RU;
5343 
5344   // Each 'key' in the map opens a new interval. The values
5345   // of the map are the index of the 'last seen' usage of the
5346   // instruction that is the key.
5347   using IntervalMap = DenseMap<Instruction *, unsigned>;
5348 
5349   // Maps instruction to its index.
5350   SmallVector<Instruction *, 64> IdxToInstr;
5351   // Marks the end of each interval.
5352   IntervalMap EndPoint;
  // Saves the set of instructions that are used inside the loop.
5354   SmallPtrSet<Instruction *, 8> Ends;
5355   // Saves the list of values that are used in the loop but are
5356   // defined outside the loop, such as arguments and constants.
5357   SmallPtrSet<Value *, 8> LoopInvariants;
5358 
5359   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5360     for (Instruction &I : BB->instructionsWithoutDebug()) {
5361       IdxToInstr.push_back(&I);
5362 
5363       // Save the end location of each USE.
5364       for (Value *U : I.operands()) {
5365         auto *Instr = dyn_cast<Instruction>(U);
5366 
5367         // Ignore non-instruction values such as arguments, constants, etc.
5368         if (!Instr)
5369           continue;
5370 
5371         // If this instruction is outside the loop then record it and continue.
5372         if (!TheLoop->contains(Instr)) {
5373           LoopInvariants.insert(Instr);
5374           continue;
5375         }
5376 
5377         // Overwrite previous end points.
5378         EndPoint[Instr] = IdxToInstr.size();
5379         Ends.insert(Instr);
5380       }
5381     }
5382   }
5383 
5384   // Saves the list of intervals that end with the index in 'key'.
5385   using InstrList = SmallVector<Instruction *, 2>;
5386   DenseMap<unsigned, InstrList> TransposeEnds;
5387 
5388   // Transpose the EndPoints to a list of values that end at each index.
5389   for (auto &Interval : EndPoint)
5390     TransposeEnds[Interval.second].push_back(Interval.first);
5391 
5392   SmallPtrSet<Instruction *, 8> OpenIntervals;
5393 
5394   // Get the size of the widest register.
5395   unsigned MaxSafeDepDist = -1U;
5396   if (Legal->getMaxSafeDepDistBytes() != -1U)
5397     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5398   unsigned WidestRegister =
5399       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5400   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5401 
5402   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5403   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5404 
5405   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5406 
5407   // A lambda that gets the register usage for the given type and VF.
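  // For example (roughly, ignoring legalization details), an i32 value at
  // VF = 8 on a target with 128-bit registers occupies 8 * 32 / 128 = 2
  // registers.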
5408   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5409     if (Ty->isTokenTy())
5410       return 0U;
5411     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5412     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5413   };
5414 
5415   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5416     Instruction *I = IdxToInstr[i];
5417 
5418     // Remove all of the instructions that end at this location.
5419     InstrList &List = TransposeEnds[i];
5420     for (Instruction *ToRemove : List)
5421       OpenIntervals.erase(ToRemove);
5422 
5423     // Ignore instructions that are never used within the loop.
5424     if (Ends.find(I) == Ends.end())
5425       continue;
5426 
5427     // Skip ignored values.
5428     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5429       continue;
5430 
5431     // For each VF find the maximum usage of registers.
5432     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5433       // Count the number of live intervals.
5434       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5435 
5436       if (VFs[j] == 1) {
5437         for (auto Inst : OpenIntervals) {
5438           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5439           if (RegUsage.find(ClassID) == RegUsage.end())
5440             RegUsage[ClassID] = 1;
5441           else
5442             RegUsage[ClassID] += 1;
5443         }
5444       } else {
5445         collectUniformsAndScalars(VFs[j]);
5446         for (auto Inst : OpenIntervals) {
5447           // Skip ignored values for VF > 1.
5448           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5449             continue;
5450           if (isScalarAfterVectorization(Inst, VFs[j])) {
5451             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5452             if (RegUsage.find(ClassID) == RegUsage.end())
5453               RegUsage[ClassID] = 1;
5454             else
5455               RegUsage[ClassID] += 1;
5456           } else {
5457             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5458             if (RegUsage.find(ClassID) == RegUsage.end())
5459               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5460             else
5461               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5462           }
5463         }
5464       }
5465 
5466       for (auto& pair : RegUsage) {
5467         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5468           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5469         else
5470           MaxUsages[j][pair.first] = pair.second;
5471       }
5472     }
5473 
5474     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5475                       << OpenIntervals.size() << '\n');
5476 
5477     // Add the current instruction to the list of open intervals.
5478     OpenIntervals.insert(I);
5479   }
5480 
5481   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5482     SmallMapVector<unsigned, unsigned, 4> Invariant;
5483 
5484     for (auto Inst : LoopInvariants) {
5485       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5486       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5487       if (Invariant.find(ClassID) == Invariant.end())
5488         Invariant[ClassID] = Usage;
5489       else
5490         Invariant[ClassID] += Usage;
5491     }
5492 
5493     LLVM_DEBUG({
5494       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5495       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5496              << " item\n";
5497       for (const auto &pair : MaxUsages[i]) {
5498         dbgs() << "LV(REG): RegisterClass: "
5499                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5500                << " registers\n";
5501       }
5502       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5503              << " item\n";
5504       for (const auto &pair : Invariant) {
5505         dbgs() << "LV(REG): RegisterClass: "
5506                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5507                << " registers\n";
5508       }
5509     });
5510 
5511     RU.LoopInvariantRegs = Invariant;
5512     RU.MaxLocalUsers = MaxUsages[i];
5513     RUs[i] = RU;
5514   }
5515 
5516   return RUs;
5517 }
5518 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5520   // TODO: Cost model for emulated masked load/store is completely
5521   // broken. This hack guides the cost model to use an artificially
5522   // high enough value to practically disable vectorization with such
5523   // operations, except where previously deployed legality hack allowed
5524   // using very low cost values. This is to avoid regressions coming simply
5525   // from moving "masked load/store" check from legality to cost model.
5526   // Masked Load/Gather emulation was previously never allowed.
5527   // Limited number of Masked Store/Scatter emulation was allowed.
5528   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5529   return isa<LoadInst>(I) ||
5530          (isa<StoreInst>(I) &&
5531           NumPredStores > NumberOfStoresToPredicate);
5532 }
5533 
5534 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5535   // If we aren't vectorizing the loop, or if we've already collected the
5536   // instructions to scalarize, there's nothing to do. Collection may already
5537   // have occurred if we have a user-selected VF and are now computing the
5538   // expected cost for interleaving.
5539   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5540     return;
5541 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5543   // not profitable to scalarize any instructions, the presence of VF in the
5544   // map will indicate that we've analyzed it already.
5545   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5546 
5547   // Find all the instructions that are scalar with predication in the loop and
5548   // determine if it would be better to not if-convert the blocks they are in.
5549   // If so, we also record the instructions to scalarize.
5550   for (BasicBlock *BB : TheLoop->blocks()) {
5551     if (!blockNeedsPredication(BB))
5552       continue;
5553     for (Instruction &I : *BB)
5554       if (isScalarWithPredication(&I)) {
5555         ScalarCostsTy ScalarCosts;
5556         // Do not apply discount logic if hacked cost is needed
5557         // for emulated masked memrefs.
5558         if (!useEmulatedMaskMemRefHack(&I) &&
5559             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5560           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5561         // Remember that BB will remain after vectorization.
5562         PredicatedBBsAfterVectorization.insert(BB);
5563       }
5564   }
5565 }
5566 
5567 int LoopVectorizationCostModel::computePredInstDiscount(
5568     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5569     unsigned VF) {
5570   assert(!isUniformAfterVectorization(PredInst, VF) &&
5571          "Instruction marked uniform-after-vectorization will be predicated");
5572 
5573   // Initialize the discount to zero, meaning that the scalar version and the
5574   // vector version cost the same.
5575   int Discount = 0;
5576 
5577   // Holds instructions to analyze. The instructions we visit are mapped in
5578   // ScalarCosts. Those instructions are the ones that would be scalarized if
5579   // we find that the scalar version costs less.
5580   SmallVector<Instruction *, 8> Worklist;
5581 
5582   // Returns true if the given instruction can be scalarized.
5583   auto canBeScalarized = [&](Instruction *I) -> bool {
5584     // We only attempt to scalarize instructions forming a single-use chain
5585     // from the original predicated block that would otherwise be vectorized.
5586     // Although not strictly necessary, we give up on instructions we know will
5587     // already be scalar to avoid traversing chains that are unlikely to be
5588     // beneficial.
5589     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5590         isScalarAfterVectorization(I, VF))
5591       return false;
5592 
5593     // If the instruction is scalar with predication, it will be analyzed
5594     // separately. We ignore it within the context of PredInst.
5595     if (isScalarWithPredication(I))
5596       return false;
5597 
5598     // If any of the instruction's operands are uniform after vectorization,
5599     // the instruction cannot be scalarized. This prevents, for example, a
5600     // masked load from being scalarized.
5601     //
5602     // We assume we will only emit a value for lane zero of an instruction
5603     // marked uniform after vectorization, rather than VF identical values.
5604     // Thus, if we scalarize an instruction that uses a uniform, we would
5605     // create uses of values corresponding to the lanes we aren't emitting code
5606     // for. This behavior can be changed by allowing getScalarValue to clone
5607     // the lane zero values for uniforms rather than asserting.
5608     for (Use &U : I->operands())
5609       if (auto *J = dyn_cast<Instruction>(U.get()))
5610         if (isUniformAfterVectorization(J, VF))
5611           return false;
5612 
5613     // Otherwise, we can scalarize the instruction.
5614     return true;
5615   };
5616 
5617   // Compute the expected cost discount from scalarizing the entire expression
5618   // feeding the predicated instruction. We currently only consider expressions
5619   // that are single-use instruction chains.
5620   Worklist.push_back(PredInst);
5621   while (!Worklist.empty()) {
5622     Instruction *I = Worklist.pop_back_val();
5623 
5624     // If we've already analyzed the instruction, there's nothing to do.
5625     if (ScalarCosts.find(I) != ScalarCosts.end())
5626       continue;
5627 
5628     // Compute the cost of the vector instruction. Note that this cost already
5629     // includes the scalarization overhead of the predicated instruction.
5630     unsigned VectorCost = getInstructionCost(I, VF).first;
5631 
5632     // Compute the cost of the scalarized instruction. This cost is the cost of
5633     // the instruction as if it wasn't if-converted and instead remained in the
5634     // predicated block. We will scale this cost by block probability after
5635     // computing the scalarization overhead.
5636     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5637 
5638     // Compute the scalarization overhead of needed insertelement instructions
5639     // and phi nodes.
5640     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5641       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5642                                                  true, false);
5643       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5644     }
5645 
5646     // Compute the scalarization overhead of needed extractelement
5647     // instructions. For each of the instruction's operands, if the operand can
5648     // be scalarized, add it to the worklist; otherwise, account for the
5649     // overhead.
5650     for (Use &U : I->operands())
5651       if (auto *J = dyn_cast<Instruction>(U.get())) {
5652         assert(VectorType::isValidElementType(J->getType()) &&
5653                "Instruction has non-scalar type");
5654         if (canBeScalarized(J))
5655           Worklist.push_back(J);
5656         else if (needsExtract(J, VF))
5657           ScalarCost += TTI.getScalarizationOverhead(
5658                               ToVectorTy(J->getType(),VF), false, true);
5659       }
5660 
5661     // Scale the total scalar cost by block probability.
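    // The reciprocal block probability is an assumption of the cost model
    // (the predicated block is taken to execute only a fraction of the
    // iterations), so the scalar cost is discounted accordingly.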
5662     ScalarCost /= getReciprocalPredBlockProb();
5663 
5664     // Compute the discount. A non-negative discount means the vector version
5665     // of the instruction costs more, and scalarizing would be beneficial.
5666     Discount += VectorCost - ScalarCost;
5667     ScalarCosts[I] = ScalarCost;
5668   }
5669 
5670   return Discount;
5671 }
5672 
5673 LoopVectorizationCostModel::VectorizationCostTy
5674 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5675   VectorizationCostTy Cost;
5676 
5677   // For each block.
5678   for (BasicBlock *BB : TheLoop->blocks()) {
5679     VectorizationCostTy BlockCost;
5680 
5681     // For each instruction in the old loop.
5682     for (Instruction &I : BB->instructionsWithoutDebug()) {
5683       // Skip ignored values.
5684       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5685           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5686         continue;
5687 
5688       VectorizationCostTy C = getInstructionCost(&I, VF);
5689 
5690       // Check if we should override the cost.
5691       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5692         C.first = ForceTargetInstructionCost;
5693 
5694       BlockCost.first += C.first;
5695       BlockCost.second |= C.second;
5696       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5697                         << " for VF " << VF << " For instruction: " << I
5698                         << '\n');
5699     }
5700 
5701     // If we are vectorizing a predicated block, it will have been
5702     // if-converted. This means that the block's instructions (aside from
5703     // stores and instructions that may divide by zero) will now be
5704     // unconditionally executed. For the scalar case, we may not always execute
5705     // the predicated block. Thus, scale the block's cost by the probability of
5706     // executing it.
5707     if (VF == 1 && blockNeedsPredication(BB))
5708       BlockCost.first /= getReciprocalPredBlockProb();
5709 
5710     Cost.first += BlockCost.first;
5711     Cost.second |= BlockCost.second;
5712   }
5713 
5714   return Cost;
5715 }
5716 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
5719 ///
5720 /// This SCEV can be sent to the Target in order to estimate the address
5721 /// calculation cost.
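///
/// For example, a pointer computed as
///   getelementptr inbounds i32, i32* %base, i64 %iv
/// where %base is loop invariant and %iv is an induction variable qualifies,
/// and the SCEV of the pointer is returned (illustrative IR).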
5722 static const SCEV *getAddressAccessSCEV(
5723               Value *Ptr,
5724               LoopVectorizationLegality *Legal,
5725               PredicatedScalarEvolution &PSE,
5726               const Loop *TheLoop) {
5727 
5728   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5729   if (!Gep)
5730     return nullptr;
5731 
5732   // We are looking for a gep with all loop invariant indices except for one
5733   // which should be an induction variable.
5734   auto SE = PSE.getSE();
5735   unsigned NumOperands = Gep->getNumOperands();
5736   for (unsigned i = 1; i < NumOperands; ++i) {
5737     Value *Opd = Gep->getOperand(i);
5738     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5739         !Legal->isInductionVariable(Opd))
5740       return nullptr;
5741   }
5742 
  // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the
  // SCEV of the pointer.
5744   return PSE.getSCEV(Ptr);
5745 }
5746 
5747 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5748   return Legal->hasStride(I->getOperand(0)) ||
5749          Legal->hasStride(I->getOperand(1));
5750 }
5751 
5752 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5753                                                                  unsigned VF) {
5754   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5755   Type *ValTy = getMemInstValueType(I);
5756   auto SE = PSE.getSE();
5757 
5758   unsigned AS = getLoadStoreAddressSpace(I);
5759   Value *Ptr = getLoadStorePointerOperand(I);
5760   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5761 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5764   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5765 
5766   // Get the cost of the scalar memory instruction and address computation.
5767   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5768 
5769   // Don't pass *I here, since it is scalar but will actually be part of a
5770   // vectorized loop where the user of it is a vectorized instruction.
5771   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5772   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5773                                    Alignment, AS);
5774 
5775   // Get the overhead of the extractelement and insertelement instructions
5776   // we might create due to scalarization.
5777   Cost += getScalarizationOverhead(I, VF);
5778 
5779   // If we have a predicated store, it may not be executed for each vector
5780   // lane. Scale the cost by the probability of executing the predicated
5781   // block.
5782   if (isPredicatedInst(I)) {
5783     Cost /= getReciprocalPredBlockProb();
5784 
5785     if (useEmulatedMaskMemRefHack(I))
5786       // Artificially setting to a high enough value to practically disable
5787       // vectorization with such operations.
5788       Cost = 3000000;
5789   }
5790 
5791   return Cost;
5792 }
5793 
5794 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5795                                                              unsigned VF) {
5796   Type *ValTy = getMemInstValueType(I);
5797   Type *VectorTy = ToVectorTy(ValTy, VF);
5798   Value *Ptr = getLoadStorePointerOperand(I);
5799   unsigned AS = getLoadStoreAddressSpace(I);
5800   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5801 
5802   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5803          "Stride should be 1 or -1 for consecutive memory access");
5804   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5805   unsigned Cost = 0;
5806   if (Legal->isMaskRequired(I))
5807     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5808                                       Alignment ? Alignment->value() : 0, AS);
5809   else
5810     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5811 
5812   bool Reverse = ConsecutiveStride < 0;
5813   if (Reverse)
5814     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5815   return Cost;
5816 }
5817 
5818 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5819                                                          unsigned VF) {
5820   Type *ValTy = getMemInstValueType(I);
5821   Type *VectorTy = ToVectorTy(ValTy, VF);
5822   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5823   unsigned AS = getLoadStoreAddressSpace(I);
5824   if (isa<LoadInst>(I)) {
5825     return TTI.getAddressComputationCost(ValTy) +
5826            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5827            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5828   }
5829   StoreInst *SI = cast<StoreInst>(I);
5830 
5831   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5832   return TTI.getAddressComputationCost(ValTy) +
5833          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5834          (isLoopInvariantStoreValue
5835               ? 0
5836               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5837                                        VF - 1));
5838 }
5839 
5840 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5841                                                           unsigned VF) {
5842   Type *ValTy = getMemInstValueType(I);
5843   Type *VectorTy = ToVectorTy(ValTy, VF);
5844   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5845   Value *Ptr = getLoadStorePointerOperand(I);
5846 
5847   return TTI.getAddressComputationCost(VectorTy) +
5848          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5849                                     Legal->isMaskRequired(I),
5850                                     Alignment ? Alignment->value() : 0);
5851 }
5852 
5853 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5854                                                             unsigned VF) {
5855   Type *ValTy = getMemInstValueType(I);
5856   Type *VectorTy = ToVectorTy(ValTy, VF);
5857   unsigned AS = getLoadStoreAddressSpace(I);
5858 
5859   auto Group = getInterleavedAccessGroup(I);
5860   assert(Group && "Fail to get an interleaved access group.");
5861 
5862   unsigned InterleaveFactor = Group->getFactor();
5863   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5864 
5865   // Holds the indices of existing members in an interleaved load group.
5866   // An interleaved store group doesn't need this as it doesn't allow gaps.
5867   SmallVector<unsigned, 4> Indices;
5868   if (isa<LoadInst>(I)) {
5869     for (unsigned i = 0; i < InterleaveFactor; i++)
5870       if (Group->getMember(i))
5871         Indices.push_back(i);
5872   }
5873 
5874   // Calculate the cost of the whole interleaved group.
5875   bool UseMaskForGaps =
5876       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5877   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5878       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5879       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5880 
5881   if (Group->isReverse()) {
5882     // TODO: Add support for reversed masked interleaved access.
5883     assert(!Legal->isMaskRequired(I) &&
5884            "Reverse masked interleaved access not supported.");
5885     Cost += Group->getNumMembers() *
5886             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5887   }
5888   return Cost;
5889 }
5890 
5891 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5892                                                               unsigned VF) {
5893   // Calculate scalar cost only. Vectorization cost should be ready at this
5894   // moment.
5895   if (VF == 1) {
5896     Type *ValTy = getMemInstValueType(I);
5897     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5898     unsigned AS = getLoadStoreAddressSpace(I);
5899 
5900     return TTI.getAddressComputationCost(ValTy) +
5901            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5902   }
5903   return getWideningCost(I, VF);
5904 }
5905 
5906 LoopVectorizationCostModel::VectorizationCostTy
5907 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5908   // If we know that this instruction will remain uniform, check the cost of
5909   // the scalar version.
5910   if (isUniformAfterVectorization(I, VF))
5911     VF = 1;
5912 
5913   if (VF > 1 && isProfitableToScalarize(I, VF))
5914     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5915 
5916   // Forced scalars do not have any scalarization overhead.
5917   auto ForcedScalar = ForcedScalars.find(VF);
5918   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5919     auto InstSet = ForcedScalar->second;
5920     if (InstSet.find(I) != InstSet.end())
5921       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5922   }
5923 
5924   Type *VectorTy;
5925   unsigned C = getInstructionCost(I, VF, VectorTy);
5926 
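  // The type counts as not scalarized if the legalized vector type splits
  // into fewer parts than VF, i.e. some lanes still share a vector register
  // and real vector instructions are expected.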
5927   bool TypeNotScalarized =
5928       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5929   return VectorizationCostTy(C, TypeNotScalarized);
5930 }
5931 
5932 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5933                                                               unsigned VF) {
5934 
5935   if (VF == 1)
5936     return 0;
5937 
5938   unsigned Cost = 0;
5939   Type *RetTy = ToVectorTy(I->getType(), VF);
5940   if (!RetTy->isVoidTy() &&
5941       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5942     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5943 
5944   // Some targets keep addresses scalar.
5945   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5946     return Cost;
5947 
5948   // Some targets support efficient element stores.
5949   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5950     return Cost;
5951 
5952   // Collect operands to consider.
5953   CallInst *CI = dyn_cast<CallInst>(I);
5954   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5955 
5956   // Skip operands that do not require extraction/scalarization and do not incur
5957   // any overhead.
5958   return Cost + TTI.getOperandsScalarizationOverhead(
5959                     filterExtractingOperands(Ops, VF), VF);
5960 }
5961 
5962 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5963   if (VF == 1)
5964     return;
5965   NumPredStores = 0;
5966   for (BasicBlock *BB : TheLoop->blocks()) {
5967     // For each instruction in the old loop.
5968     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
5970       if (!Ptr)
5971         continue;
5972 
5973       // TODO: We should generate better code and update the cost model for
5974       // predicated uniform stores. Today they are treated as any other
5975       // predicated store (see added test cases in
5976       // invariant-store-vectorization.ll).
5977       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5978         NumPredStores++;
5979 
5980       if (Legal->isUniform(Ptr) &&
5981           // Conditional loads and stores should be scalarized and predicated.
5982           // isScalarWithPredication cannot be used here since masked
5983           // gather/scatters are not considered scalar with predication.
5984           !Legal->blockNeedsPredication(I.getParent())) {
5985         // TODO: Avoid replicating loads and stores instead of
5986         // relying on instcombine to remove them.
5987         // Load: Scalar load + broadcast
5988         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5989         unsigned Cost = getUniformMemOpCost(&I, VF);
5990         setWideningDecision(&I, VF, CM_Scalarize, Cost);
5991         continue;
5992       }
5993 
5994       // We assume that widening is the best solution when possible.
5995       if (memoryInstructionCanBeWidened(&I, VF)) {
5996         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5997         int ConsecutiveStride =
5998                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5999         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6000                "Expected consecutive stride.");
6001         InstWidening Decision =
6002             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6003         setWideningDecision(&I, VF, Decision, Cost);
6004         continue;
6005       }
6006 
6007       // Choose between Interleaving, Gather/Scatter or Scalarization.
6008       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6009       unsigned NumAccesses = 1;
6010       if (isAccessInterleaved(&I)) {
6011         auto Group = getInterleavedAccessGroup(&I);
6012         assert(Group && "Fail to get an interleaved access group.");
6013 
6014         // Make one decision for the whole group.
6015         if (getWideningDecision(&I, VF) != CM_Unknown)
6016           continue;
6017 
6018         NumAccesses = Group->getNumMembers();
6019         if (interleavedAccessCanBeWidened(&I, VF))
6020           InterleaveCost = getInterleaveGroupCost(&I, VF);
6021       }
6022 
6023       unsigned GatherScatterCost =
6024           isLegalGatherOrScatter(&I)
6025               ? getGatherScatterCost(&I, VF) * NumAccesses
6026               : std::numeric_limits<unsigned>::max();
6027 
6028       unsigned ScalarizationCost =
6029           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6030 
6031       // Choose better solution for the current VF,
6032       // write down this decision and use it during vectorization.
6033       unsigned Cost;
6034       InstWidening Decision;
6035       if (InterleaveCost <= GatherScatterCost &&
6036           InterleaveCost < ScalarizationCost) {
6037         Decision = CM_Interleave;
6038         Cost = InterleaveCost;
6039       } else if (GatherScatterCost < ScalarizationCost) {
6040         Decision = CM_GatherScatter;
6041         Cost = GatherScatterCost;
6042       } else {
6043         Decision = CM_Scalarize;
6044         Cost = ScalarizationCost;
6045       }
      // If the instruction belongs to an interleave group, the whole group
6047       // receives the same decision. The whole group receives the cost, but
6048       // the cost will actually be assigned to one instruction.
6049       if (auto Group = getInterleavedAccessGroup(&I))
6050         setWideningDecision(Group, VF, Decision, Cost);
6051       else
6052         setWideningDecision(&I, VF, Decision, Cost);
6053     }
6054   }
6055 
6056   // Make sure that any load of address and any other address computation
6057   // remains scalar unless there is gather/scatter support. This avoids
6058   // inevitable extracts into address registers, and also has the benefit of
6059   // activating LSR more, since that pass can't optimize vectorized
6060   // addresses.
6061   if (TTI.prefersVectorizedAddressing())
6062     return;
6063 
6064   // Start with all scalar pointer uses.
6065   SmallPtrSet<Instruction *, 8> AddrDefs;
6066   for (BasicBlock *BB : TheLoop->blocks())
6067     for (Instruction &I : *BB) {
6068       Instruction *PtrDef =
6069         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6070       if (PtrDef && TheLoop->contains(PtrDef) &&
6071           getWideningDecision(&I, VF) != CM_GatherScatter)
6072         AddrDefs.insert(PtrDef);
6073     }
6074 
6075   // Add all instructions used to generate the addresses.
6076   SmallVector<Instruction *, 4> Worklist;
6077   for (auto *I : AddrDefs)
6078     Worklist.push_back(I);
6079   while (!Worklist.empty()) {
6080     Instruction *I = Worklist.pop_back_val();
6081     for (auto &Op : I->operands())
6082       if (auto *InstOp = dyn_cast<Instruction>(Op))
6083         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6084             AddrDefs.insert(InstOp).second)
6085           Worklist.push_back(InstOp);
6086   }
6087 
6088   for (auto *I : AddrDefs) {
6089     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded value is involved in an address computation, the decision is
      // instead changed here when we know this is the case.
6094       InstWidening Decision = getWideningDecision(I, VF);
6095       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6096         // Scalarize a widened load of address.
6097         setWideningDecision(I, VF, CM_Scalarize,
6098                             (VF * getMemoryInstructionCost(I, 1)));
6099       else if (auto Group = getInterleavedAccessGroup(I)) {
6100         // Scalarize an interleave group of address loads.
6101         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6102           if (Instruction *Member = Group->getMember(I))
6103             setWideningDecision(Member, VF, CM_Scalarize,
6104                                 (VF * getMemoryInstructionCost(Member, 1)));
6105         }
6106       }
6107     } else
6108       // Make sure I gets scalarized and a cost estimate without
6109       // scalarization overhead.
6110       ForcedScalars[VF].insert(I);
6111   }
6112 }
6113 
6114 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6115                                                         unsigned VF,
6116                                                         Type *&VectorTy) {
6117   Type *RetTy = I->getType();
6118   if (canTruncateToMinimalBitwidth(I, VF))
6119     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6120   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6121   auto SE = PSE.getSE();
6122 
6123   // TODO: We need to estimate the cost of intrinsic calls.
6124   switch (I->getOpcode()) {
6125   case Instruction::GetElementPtr:
6126     // We mark this instruction as zero-cost because the cost of GEPs in
6127     // vectorized code depends on whether the corresponding memory instruction
6128     // is scalarized or not. Therefore, we handle GEPs with the memory
6129     // instruction cost.
6130     return 0;
6131   case Instruction::Br: {
6132     // When instructions are scalarized and predicated, there will be VF
6133     // predicated blocks in the vectorized loop. Each branch around these
6134     // blocks also requires an extract of its vector compare i1 element.
6135     bool ScalarPredicatedBB = false;
6136     BranchInst *BI = cast<BranchInst>(I);
6137     if (VF > 1 && BI->isConditional() &&
6138         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6139              PredicatedBBsAfterVectorization.end() ||
6140          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6141              PredicatedBBsAfterVectorization.end()))
6142       ScalarPredicatedBB = true;
6143 
6144     if (ScalarPredicatedBB) {
6145       // Return cost for branches around scalarized and predicated blocks.
6146       Type *Vec_i1Ty =
6147           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6148       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6149               (TTI.getCFInstrCost(Instruction::Br) * VF));
6150     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6151       // The back-edge branch will remain, as will all scalar branches.
6152       return TTI.getCFInstrCost(Instruction::Br);
6153     else
6154       // This branch will be eliminated by if-conversion.
6155       return 0;
6156     // Note: We currently assume zero cost for an unconditional branch inside
6157     // a predicated block since it will become a fall-through, although we
6158     // may decide in the future to call TTI for all branches.
6159   }
6160   case Instruction::PHI: {
6161     auto *Phi = cast<PHINode>(I);
6162 
6163     // First-order recurrences are replaced by vector shuffles inside the loop.
6164     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6165     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6166       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6167                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6168 
6169     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6170     // converted into select instructions. We require N - 1 selects per phi
6171     // node, where N is the number of incoming values.
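         // For example, a phi in a non-header block with three incoming values
         // is lowered to a chain of two vector selects, so two select costs are
         // charged.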
6172     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6173       return (Phi->getNumIncomingValues() - 1) *
6174              TTI.getCmpSelInstrCost(
6175                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6176                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6177 
6178     return TTI.getCFInstrCost(Instruction::PHI);
6179   }
6180   case Instruction::UDiv:
6181   case Instruction::SDiv:
6182   case Instruction::URem:
6183   case Instruction::SRem:
6184     // If we have a predicated instruction, it may not be executed for each
6185     // vector lane. Get the scalarization cost and scale this amount by the
6186     // probability of executing the predicated block. If the instruction is not
6187     // predicated, we fall through to the next case.
6188     if (VF > 1 && isScalarWithPredication(I)) {
6189       unsigned Cost = 0;
6190 
6191       // These instructions have a non-void type, so account for the phi nodes
6192       // that we will create. This cost is likely to be zero. The phi node
6193       // cost, if any, should be scaled by the block probability because it
6194       // models a copy at the end of each predicated block.
6195       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6196 
6197       // The cost of the non-predicated instruction.
6198       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6199 
6200       // The cost of insertelement and extractelement instructions needed for
6201       // scalarization.
6202       Cost += getScalarizationOverhead(I, VF);
6203 
6204       // Scale the cost by the probability of executing the predicated blocks.
6205       // This assumes the predicated block for each vector lane is equally
6206       // likely.
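           // For instance, if getReciprocalPredBlockProb() returns 2 (each
           // predicated block assumed to execute half the time), the summed
           // scalar cost above is halved.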
6207       return Cost / getReciprocalPredBlockProb();
6208     }
6209     LLVM_FALLTHROUGH;
6210   case Instruction::Add:
6211   case Instruction::FAdd:
6212   case Instruction::Sub:
6213   case Instruction::FSub:
6214   case Instruction::Mul:
6215   case Instruction::FMul:
6216   case Instruction::FDiv:
6217   case Instruction::FRem:
6218   case Instruction::Shl:
6219   case Instruction::LShr:
6220   case Instruction::AShr:
6221   case Instruction::And:
6222   case Instruction::Or:
6223   case Instruction::Xor: {
6224     // Since we will replace the stride by 1, the multiplication should go away.
6225     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6226       return 0;
6227     // Certain instructions can be cheaper to vectorize if they have a constant
6228     // second vector operand. One example of this is shifts on x86.
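         // Below, a second operand that is loop-invariant but not a compile-time
         // constant, e.g. a loop-invariant shift amount, is reported to TTI as
         // OK_UniformValue, which some targets can handle more cheaply than a
         // fully variable vector operand.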
6229     Value *Op2 = I->getOperand(1);
6230     TargetTransformInfo::OperandValueProperties Op2VP;
6231     TargetTransformInfo::OperandValueKind Op2VK =
6232         TTI.getOperandInfo(Op2, Op2VP);
6233     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6234       Op2VK = TargetTransformInfo::OK_UniformValue;
6235 
6236     SmallVector<const Value *, 4> Operands(I->operand_values());
6237     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6238     return N * TTI.getArithmeticInstrCost(
6239                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6240                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6241   }
6242   case Instruction::FNeg: {
6243     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6244     return N * TTI.getArithmeticInstrCost(
6245                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6246                    TargetTransformInfo::OK_AnyValue,
6247                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6248                    I->getOperand(0), I);
6249   }
6250   case Instruction::Select: {
6251     SelectInst *SI = cast<SelectInst>(I);
6252     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6253     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6254     Type *CondTy = SI->getCondition()->getType();
6255     if (!ScalarCond)
6256       CondTy = VectorType::get(CondTy, VF);
6257 
6258     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6259   }
6260   case Instruction::ICmp:
6261   case Instruction::FCmp: {
6262     Type *ValTy = I->getOperand(0)->getType();
6263     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6264     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6265       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6266     VectorTy = ToVectorTy(ValTy, VF);
6267     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6268   }
6269   case Instruction::Store:
6270   case Instruction::Load: {
6271     unsigned Width = VF;
6272     if (Width > 1) {
6273       InstWidening Decision = getWideningDecision(I, Width);
6274       assert(Decision != CM_Unknown &&
6275              "CM decision should be taken at this point");
6276       if (Decision == CM_Scalarize)
6277         Width = 1;
6278     }
6279     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6280     return getMemoryInstructionCost(I, VF);
6281   }
6282   case Instruction::ZExt:
6283   case Instruction::SExt:
6284   case Instruction::FPToUI:
6285   case Instruction::FPToSI:
6286   case Instruction::FPExt:
6287   case Instruction::PtrToInt:
6288   case Instruction::IntToPtr:
6289   case Instruction::SIToFP:
6290   case Instruction::UIToFP:
6291   case Instruction::Trunc:
6292   case Instruction::FPTrunc:
6293   case Instruction::BitCast: {
6294     // We optimize the truncation of induction variables having constant
6295     // integer steps. The cost of these truncations is the same as the scalar
6296     // operation.
6297     if (isOptimizableIVTruncate(I, VF)) {
6298       auto *Trunc = cast<TruncInst>(I);
6299       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6300                                   Trunc->getSrcTy(), Trunc);
6301     }
6302 
6303     Type *SrcScalarTy = I->getOperand(0)->getType();
6304     Type *SrcVecTy =
6305         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6306     if (canTruncateToMinimalBitwidth(I, VF)) {
6307       // This cast is going to be shrunk. This may remove the cast or turn it
6308       // into a slightly different cast. For example, if MinBW == 16,
6309       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6310       //
6311       // Calculate the modified src and dest types.
6312       Type *MinVecTy = VectorTy;
6313       if (I->getOpcode() == Instruction::Trunc) {
6314         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6315         VectorTy =
6316             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6317       } else if (I->getOpcode() == Instruction::ZExt ||
6318                  I->getOpcode() == Instruction::SExt) {
6319         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6320         VectorTy =
6321             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6322       }
6323     }
6324 
6325     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6326     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6327   }
6328   case Instruction::Call: {
6329     bool NeedToScalarize;
6330     CallInst *CI = cast<CallInst>(I);
6331     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6332     if (getVectorIntrinsicIDForCall(CI, TLI))
6333       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6334     return CallCost;
6335   }
6336   default:
6337     // The cost of executing VF copies of the scalar instruction. This opcode
6338     // is unknown. Assume that it is the same as 'mul'.
6339     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6340            getScalarizationOverhead(I, VF);
6341   } // end of switch.
6342 }
6343 
6344 char LoopVectorize::ID = 0;
6345 
6346 static const char lv_name[] = "Loop Vectorization";
6347 
6348 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6349 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6350 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6351 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6352 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6353 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6354 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6355 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6356 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6357 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6358 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6359 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6360 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6361 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6362 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6363 
6364 namespace llvm {
6365 
6366 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6367 
6368 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6369                               bool VectorizeOnlyWhenForced) {
6370   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6371 }
6372 
6373 } // end namespace llvm
6374 
6375 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6376   // Check if the pointer operand of a load or store instruction is
6377   // consecutive.
6378   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6379     return Legal->isConsecutivePtr(Ptr);
6380   return false;
6381 }
6382 
6383 void LoopVectorizationCostModel::collectValuesToIgnore() {
6384   // Ignore ephemeral values.
6385   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6386 
6387   // Ignore type-promoting instructions we identified during reduction
6388   // detection.
6389   for (auto &Reduction : *Legal->getReductionVars()) {
6390     RecurrenceDescriptor &RedDes = Reduction.second;
6391     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6392     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6393   }
6394   // Ignore type-casting instructions we identified during induction
6395   // detection.
6396   for (auto &Induction : *Legal->getInductionVars()) {
6397     InductionDescriptor &IndDes = Induction.second;
6398     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6399     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6400   }
6401 }
6402 
6403 // TODO: we could return a pair of values that specify the max VF and
6404 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6405 // `buildVPlans(VF, VF)`. We cannot do this yet because VPlan does not
6406 // have a cost model that can choose which plan to execute when more
6407 // than one is generated.
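     // For example, with 256-bit wide vector registers and a widest element
     // type of 32 bits, determineVPlanVF below returns a VF of 256 / 32 = 8.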
6408 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6409                                  LoopVectorizationCostModel &CM) {
6410   unsigned WidestType;
6411   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6412   return WidestVectorRegBits / WidestType;
6413 }
6414 
6415 VectorizationFactor
6416 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6417   unsigned VF = UserVF;
6418   // Outer loop handling: outer loops may require CFG and instruction-level
6419   // transformations before we can even evaluate whether vectorization is
6420   // profitable. Since we cannot modify the incoming IR, we need to build
6421   // VPlan upfront in the vectorization pipeline.
6422   if (!OrigLoop->empty()) {
6423     // If the user doesn't provide a vectorization factor, determine a
6424     // reasonable one.
6425     if (!UserVF) {
6426       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6427       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6428 
6429       // Make sure we have a VF > 1 for stress testing.
6430       if (VPlanBuildStressTest && VF < 2) {
6431         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6432                           << "overriding computed VF.\n");
6433         VF = 4;
6434       }
6435     }
6436     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6437     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6438     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6439                       << " to build VPlans.\n");
6440     buildVPlans(VF, VF);
6441 
6442     // For VPlan build stress testing, we bail out after VPlan construction.
6443     if (VPlanBuildStressTest)
6444       return VectorizationFactor::Disabled();
6445 
6446     return {VF, 0};
6447   }
6448 
6449   LLVM_DEBUG(
6450       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6451                 "VPlan-native path.\n");
6452   return VectorizationFactor::Disabled();
6453 }
6454 
6455 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6456   assert(OrigLoop->empty() && "Inner loop expected.");
6457   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6458   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6459     return None;
6460 
6461   // Invalidate interleave groups if all blocks of the loop will be predicated.
6462   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6463       !useMaskedInterleavedAccesses(*TTI)) {
6464     LLVM_DEBUG(
6465         dbgs()
6466         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6467            "which requires masked-interleaved support.\n");
6468     CM.InterleaveInfo.reset();
6469   }
6470 
6471   if (UserVF) {
6472     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6473     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6474     // Collect the instructions (and their associated costs) that will be more
6475     // profitable to scalarize.
6476     CM.selectUserVectorizationFactor(UserVF);
6477     buildVPlansWithVPRecipes(UserVF, UserVF);
6478     LLVM_DEBUG(printPlans(dbgs()));
6479     return {{UserVF, 0}};
6480   }
6481 
6482   unsigned MaxVF = MaybeMaxVF.getValue();
6483   assert(MaxVF != 0 && "MaxVF is zero.");
6484 
6485   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6486     // Collect Uniform and Scalar instructions after vectorization with VF.
6487     CM.collectUniformsAndScalars(VF);
6488 
6489     // Collect the instructions (and their associated costs) that will be more
6490     // profitable to scalarize.
6491     if (VF > 1)
6492       CM.collectInstsToScalarize(VF);
6493   }
6494 
6495   buildVPlansWithVPRecipes(1, MaxVF);
6496   LLVM_DEBUG(printPlans(dbgs()));
6497   if (MaxVF == 1)
6498     return VectorizationFactor::Disabled();
6499 
6500   // Select the optimal vectorization factor.
6501   return CM.selectVectorizationFactor(MaxVF);
6502 }
6503 
6504 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6505   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6506                     << '\n');
6507   BestVF = VF;
6508   BestUF = UF;
6509 
6510   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6511     return !Plan->hasVF(VF);
6512   });
6513   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6514 }
6515 
6516 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6517                                            DominatorTree *DT) {
6518   // Perform the actual loop transformation.
6519 
6520   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6521   VPCallbackILV CallbackILV(ILV);
6522 
6523   VPTransformState State{BestVF, BestUF,      LI,
6524                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6525                          &ILV,   CallbackILV};
6526   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6527   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6528 
6529   //===------------------------------------------------===//
6530   //
6531   // Notice: any optimization or new instruction that goes
6532   // into the code below should also be implemented in
6533   // the cost-model.
6534   //
6535   //===------------------------------------------------===//
6536 
6537   // 2. Copy and widen instructions from the old loop into the new loop.
6538   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6539   VPlans.front()->execute(&State);
6540 
6541   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6542   //    predication, updating analyses.
6543   ILV.fixVectorizedLoop();
6544 }
6545 
6546 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6547     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6548   BasicBlock *Latch = OrigLoop->getLoopLatch();
6549 
6550   // We create new control-flow for the vectorized loop, so the original
6551   // condition will be dead after vectorization if it's only used by the
6552   // branch.
6553   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6554   if (Cmp && Cmp->hasOneUse())
6555     DeadInstructions.insert(Cmp);
6556 
6557   // We create new "steps" for induction variable updates to which the original
6558   // induction variables map. An original update instruction will be dead if
6559   // all its users except the induction variable are dead.
6560   for (auto &Induction : *Legal->getInductionVars()) {
6561     PHINode *Ind = Induction.first;
6562     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6563     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6564           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6565                                  DeadInstructions.end();
6566         }))
6567       DeadInstructions.insert(IndUpdate);
6568 
6569     // We also record as "Dead" the type-casting instructions we identified
6570     // during induction analysis. We don't need any handling for them in the
6571     // vectorized loop because we have proven that, under a proper runtime
6572     // test guarding the vectorized loop, the value of the phi and the casted
6573     // value of the phi are the same. The last instruction in this casting chain
6574     // will get its scalar/vector/widened def from the scalar/vector/widened def
6575     // of the respective phi node. Any other casts in the induction def-use chain
6576     // have no uses outside the phi update chain, and will be ignored.
6577     InductionDescriptor &IndDes = Induction.second;
6578     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6579     DeadInstructions.insert(Casts.begin(), Casts.end());
6580   }
6581 }
6582 
6583 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6584 
6585 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6586 
6587 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6588                                         Instruction::BinaryOps BinOp) {
6589   // When unrolling and the VF is 1, we only need to add a simple scalar.
6590   Type *Ty = Val->getType();
6591   assert(!Ty->isVectorTy() && "Val must be a scalar");
6592 
6593   if (Ty->isFloatingPointTy()) {
6594     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6595 
6596     // Floating point operations had to be 'fast' to enable the unrolling.
6597     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6598     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6599   }
6600   Constant *C = ConstantInt::get(Ty, StartIdx);
6601   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6602 }
6603 
6604 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6605   SmallVector<Metadata *, 4> MDs;
6606   // Reserve first location for self reference to the LoopID metadata node.
6607   MDs.push_back(nullptr);
6608   bool IsUnrollMetadata = false;
6609   MDNode *LoopID = L->getLoopID();
6610   if (LoopID) {
6611     // First find existing loop unrolling disable metadata.
6612     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6613       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6614       if (MD) {
6615         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6616         IsUnrollMetadata =
6617             S && S->getString().startswith("llvm.loop.unroll.disable");
6618       }
6619       MDs.push_back(LoopID->getOperand(i));
6620     }
6621   }
6622 
6623   if (!IsUnrollMetadata) {
6624     // Add runtime unroll disable metadata.
6625     LLVMContext &Context = L->getHeader()->getContext();
6626     SmallVector<Metadata *, 1> DisableOperands;
6627     DisableOperands.push_back(
6628         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6629     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6630     MDs.push_back(DisableNode);
6631     MDNode *NewLoopID = MDNode::get(Context, MDs);
6632     // Set operand 0 to refer to the loop id itself.
6633     NewLoopID->replaceOperandWith(0, NewLoopID);
6634     L->setLoopID(NewLoopID);
6635   }
6636 }
6637 
6638 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6639     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6640   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6641   bool PredicateAtRangeStart = Predicate(Range.Start);
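       // Clamp Range.End at the first VF whose decision differs from the one
       // taken at Range.Start. For example, for Range = {1, 9} and a predicate
       // that only holds for VF >= 4, the range is clamped to {1, 4} and false
       // is returned; a subsequent plan then covers VFs 4 and 8.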
6642 
6643   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6644     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6645       Range.End = TmpVF;
6646       break;
6647     }
6648 
6649   return PredicateAtRangeStart;
6650 }
6651 
6652 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6653 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6654 /// of VF's starting at a given VF and extending it as much as possible. Each
6655 /// vectorization decision can potentially shorten this sub-range during
6656 /// buildVPlan().
6657 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6658   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6659     VFRange SubRange = {VF, MaxVF + 1};
6660     VPlans.push_back(buildVPlan(SubRange));
6661     VF = SubRange.End;
6662   }
6663 }
6664 
6665 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6666                                          VPlanPtr &Plan) {
6667   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6668 
6669   // Look for cached value.
6670   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6671   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6672   if (ECEntryIt != EdgeMaskCache.end())
6673     return ECEntryIt->second;
6674 
6675   VPValue *SrcMask = createBlockInMask(Src, Plan);
6676 
6677   // The terminator has to be a branch inst!
6678   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6679   assert(BI && "Unexpected terminator found");
6680 
6681   if (!BI->isConditional())
6682     return EdgeMaskCache[Edge] = SrcMask;
6683 
6684   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6685   assert(EdgeMask && "No Edge Mask found for condition");
6686 
6687   if (BI->getSuccessor(0) != Dst)
6688     EdgeMask = Builder.createNot(EdgeMask);
6689 
6690   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6691     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6692 
6693   return EdgeMaskCache[Edge] = EdgeMask;
6694 }
6695 
6696 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6697   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6698 
6699   // Look for cached value.
6700   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6701   if (BCEntryIt != BlockMaskCache.end())
6702     return BCEntryIt->second;
6703 
6704   // All-one mask is modelled as no-mask following the convention for masked
6705   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6706   VPValue *BlockMask = nullptr;
6707 
6708   if (OrigLoop->getHeader() == BB) {
6709     if (!CM.blockNeedsPredication(BB))
6710       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6711 
6712     // Introduce the early-exit compare IV <= BTC to form header block mask.
6713     // This is used instead of IV < TC because TC may wrap, unlike BTC.
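         // For example, with a trip count of 5 (BTC = 4) and VF = 4, the first
         // vector iteration compares IV = {0,1,2,3} against 4 and all lanes are
         // active, while the second compares {4,5,6,7} and only the first lane
         // remains active.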
6714     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6715     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6716     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6717     return BlockMaskCache[BB] = BlockMask;
6718   }
6719 
6720   // This is the block mask. We OR all incoming edges.
6721   for (auto *Predecessor : predecessors(BB)) {
6722     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6723     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6724       return BlockMaskCache[BB] = EdgeMask;
6725 
6726     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6727       BlockMask = EdgeMask;
6728       continue;
6729     }
6730 
6731     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6732   }
6733 
6734   return BlockMaskCache[BB] = BlockMask;
6735 }
6736 
6737 VPWidenMemoryInstructionRecipe *
6738 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6739                                   VPlanPtr &Plan) {
6740   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6741     return nullptr;
6742 
6743   auto willWiden = [&](unsigned VF) -> bool {
6744     if (VF == 1)
6745       return false;
6746     LoopVectorizationCostModel::InstWidening Decision =
6747         CM.getWideningDecision(I, VF);
6748     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6749            "CM decision should be taken at this point.");
6750     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6751       return true;
6752     if (CM.isScalarAfterVectorization(I, VF) ||
6753         CM.isProfitableToScalarize(I, VF))
6754       return false;
6755     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6756   };
6757 
6758   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6759     return nullptr;
6760 
6761   VPValue *Mask = nullptr;
6762   if (Legal->isMaskRequired(I))
6763     Mask = createBlockInMask(I->getParent(), Plan);
6764 
6765   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6766 }
6767 
6768 VPWidenIntOrFpInductionRecipe *
6769 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6770   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6771     // Check if this is an integer or fp induction. If so, build the recipe that
6772     // produces its scalar and vector values.
6773     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6774     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6775         II.getKind() == InductionDescriptor::IK_FpInduction)
6776       return new VPWidenIntOrFpInductionRecipe(Phi);
6777 
6778     return nullptr;
6779   }
6780 
6781   // Optimize the special case where the source is a constant integer
6782   // induction variable. Notice that we can only optimize the 'trunc' case
6783   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6784   // (c) other casts depend on pointer size.
6785 
6786   // Determine whether \p K is a truncation based on an induction variable that
6787   // can be optimized.
6788   auto isOptimizableIVTruncate =
6789       [&](Instruction *K) -> std::function<bool(unsigned)> {
6790     return
6791         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6792   };
6793 
6794   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6795                                isOptimizableIVTruncate(I), Range))
6796     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6797                                              cast<TruncInst>(I));
6798   return nullptr;
6799 }
6800 
6801 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6802   PHINode *Phi = dyn_cast<PHINode>(I);
6803   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6804     return nullptr;
6805 
6806   // We know that all PHIs in non-header blocks are converted into selects, so
6807   // we don't have to worry about the insertion order and we can just use the
6808   // builder. At this point we generate the predication tree. There may be
6809   // duplications since this is a simple recursive scan, but future
6810   // optimizations will clean it up.
6811 
6812   SmallVector<VPValue *, 2> Masks;
6813   unsigned NumIncoming = Phi->getNumIncomingValues();
6814   for (unsigned In = 0; In < NumIncoming; In++) {
6815     VPValue *EdgeMask =
6816       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6817     assert((EdgeMask || NumIncoming == 1) &&
6818            "Multiple predecessors with one having a full mask");
6819     if (EdgeMask)
6820       Masks.push_back(EdgeMask);
6821   }
6822   return new VPBlendRecipe(Phi, Masks);
6823 }
6824 
6825 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6826                                  VFRange &Range) {
6827 
6828   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6829       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6830 
6831   if (IsPredicated)
6832     return false;
6833 
6834   auto IsVectorizableOpcode = [](unsigned Opcode) {
6835     switch (Opcode) {
6836     case Instruction::Add:
6837     case Instruction::And:
6838     case Instruction::AShr:
6839     case Instruction::BitCast:
6840     case Instruction::Br:
6841     case Instruction::Call:
6842     case Instruction::FAdd:
6843     case Instruction::FCmp:
6844     case Instruction::FDiv:
6845     case Instruction::FMul:
6846     case Instruction::FNeg:
6847     case Instruction::FPExt:
6848     case Instruction::FPToSI:
6849     case Instruction::FPToUI:
6850     case Instruction::FPTrunc:
6851     case Instruction::FRem:
6852     case Instruction::FSub:
6853     case Instruction::ICmp:
6854     case Instruction::IntToPtr:
6855     case Instruction::Load:
6856     case Instruction::LShr:
6857     case Instruction::Mul:
6858     case Instruction::Or:
6859     case Instruction::PHI:
6860     case Instruction::PtrToInt:
6861     case Instruction::SDiv:
6862     case Instruction::Select:
6863     case Instruction::SExt:
6864     case Instruction::Shl:
6865     case Instruction::SIToFP:
6866     case Instruction::SRem:
6867     case Instruction::Store:
6868     case Instruction::Sub:
6869     case Instruction::Trunc:
6870     case Instruction::UDiv:
6871     case Instruction::UIToFP:
6872     case Instruction::URem:
6873     case Instruction::Xor:
6874     case Instruction::ZExt:
6875       return true;
6876     }
6877     return false;
6878   };
6879 
6880   if (!IsVectorizableOpcode(I->getOpcode()))
6881     return false;
6882 
6883   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6884     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6885     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6886                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6887       return false;
6888   }
6889 
6890   auto willWiden = [&](unsigned VF) -> bool {
6891     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6892                              CM.isProfitableToScalarize(I, VF)))
6893       return false;
6894     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6895       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6896       // The following case may be scalarized depending on the VF.
6897       // The flag shows whether we use an intrinsic or a plain call for the
6898       // vectorized version of the instruction.
6899       // Is it beneficial to perform the intrinsic call rather than the lib call?
6900       bool NeedToScalarize;
6901       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6902       bool UseVectorIntrinsic =
6903           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6904       return UseVectorIntrinsic || !NeedToScalarize;
6905     }
6906     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6907       assert(CM.getWideningDecision(I, VF) ==
6908                  LoopVectorizationCostModel::CM_Scalarize &&
6909              "Memory widening decisions should have been taken care by now");
6910       return false;
6911     }
6912     return true;
6913   };
6914 
6915   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6916     return false;
6917   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6918   // to avoid having to split recipes later.
6919   bool IsSingleton = Ingredient2Recipe.count(I);
6920 
6921   // Success: widen this instruction.
6922 
6923   // Use the default widening recipe. We optimize the common case where
6924   // consecutive instructions can be represented by a single recipe.
6925   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6926       LastExtensibleRecipe->appendInstruction(I))
6927     return true;
6928 
6929   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6930   if (!IsSingleton)
6931     LastExtensibleRecipe = WidenRecipe;
6932   setRecipe(I, WidenRecipe);
6933   VPBB->appendRecipe(WidenRecipe);
6934   return true;
6935 }
6936 
6937 VPBasicBlock *VPRecipeBuilder::handleReplication(
6938     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6939     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6940     VPlanPtr &Plan) {
6941   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6942       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6943       Range);
6944 
6945   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6946       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6947 
6948   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6949   setRecipe(I, Recipe);
6950 
6951   // Find if I uses a predicated instruction. If so, it will use its scalar
6952   // value. Avoid hoisting the insert-element which packs the scalar value into
6953   // a vector value, as that happens iff all users use the vector value.
6954   for (auto &Op : I->operands())
6955     if (auto *PredInst = dyn_cast<Instruction>(Op))
6956       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6957         PredInst2Recipe[PredInst]->setAlsoPack(false);
6958 
6959   // Finalize the recipe for Instr, first if it is not predicated.
6960   if (!IsPredicated) {
6961     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6962     VPBB->appendRecipe(Recipe);
6963     return VPBB;
6964   }
6965   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6966   assert(VPBB->getSuccessors().empty() &&
6967          "VPBB has successors when handling predicated replication.");
6968   // Record predicated instructions for above packing optimizations.
6969   PredInst2Recipe[I] = Recipe;
6970   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6971   VPBlockUtils::insertBlockAfter(Region, VPBB);
6972   auto *RegSucc = new VPBasicBlock();
6973   VPBlockUtils::insertBlockAfter(RegSucc, Region);
6974   return RegSucc;
6975 }
6976 
6977 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6978                                                       VPRecipeBase *PredRecipe,
6979                                                       VPlanPtr &Plan) {
6980   // Instructions marked for predication are replicated and placed under an
6981   // if-then construct to prevent side-effects.
6982 
6983   // Generate recipes to compute the block mask for this region.
6984   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6985 
6986   // Build the triangular if-then region.
6987   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6988   assert(Instr->getParent() && "Predicated instruction not in any basic block");
6989   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6990   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6991   auto *PHIRecipe =
6992       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6993   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6994   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6995   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6996 
6997   // Note: first set Entry as region entry and then connect successors starting
6998   // from it in order, to propagate the "parent" of each VPBasicBlock.
6999   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7000   VPBlockUtils::connectBlocks(Pred, Exit);
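       // The resulting region forms a triangle: Entry branches on the mask
       // either to the ".if" block holding the predicated recipe or directly to
       // the ".continue" block, and ".if" falls through to ".continue".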
7001 
7002   return Region;
7003 }
7004 
7005 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7006                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7007   VPRecipeBase *Recipe = nullptr;
7008 
7009   // First, check for specific widening recipes that deal with memory
7010   // operations, inductions and Phi nodes.
7011   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7012       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7013       (Recipe = tryToBlend(Instr, Plan)) ||
7014       (isa<PHINode>(Instr) &&
7015        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7016     setRecipe(Instr, Recipe);
7017     VPBB->appendRecipe(Recipe);
7018     return true;
7019   }
7020 
7021   // Handle GEP widening.
7022   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7023     auto Scalarize = [&](unsigned VF) {
7024       return CM.isScalarWithPredication(Instr, VF) ||
7025              CM.isScalarAfterVectorization(Instr, VF) ||
7026              CM.isProfitableToScalarize(Instr, VF);
7027     };
7028     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7029       return false;
7030     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7031     setRecipe(Instr, Recipe);
7032     VPBB->appendRecipe(Recipe);
7033     return true;
7034   }
7035 
7036   // Check if Instr is to be widened by a general VPWidenRecipe, after
7037   // having first checked for specific widening recipes.
7038   if (tryToWiden(Instr, VPBB, Range))
7039     return true;
7040 
7041   return false;
7042 }
7043 
7044 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7045                                                         unsigned MaxVF) {
7046   assert(OrigLoop->empty() && "Inner loop expected.");
7047 
7048   // Collect conditions feeding internal conditional branches; they need to be
7049   // represented in VPlan for it to model masking.
7050   SmallPtrSet<Value *, 1> NeedDef;
7051 
7052   auto *Latch = OrigLoop->getLoopLatch();
7053   for (BasicBlock *BB : OrigLoop->blocks()) {
7054     if (BB == Latch)
7055       continue;
7056     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7057     if (Branch && Branch->isConditional())
7058       NeedDef.insert(Branch->getCondition());
7059   }
7060 
7061   // If the tail is to be folded by masking, the primary induction variable
7062   // needs to be represented in VPlan for it to model early-exit masking.
7063   // Also, both the Phi and the live-out instruction of each reduction are
7064   // required in order to introduce a select between them in VPlan.
7065   if (CM.foldTailByMasking()) {
7066     NeedDef.insert(Legal->getPrimaryInduction());
7067     for (auto &Reduction : *Legal->getReductionVars()) {
7068       NeedDef.insert(Reduction.first);
7069       NeedDef.insert(Reduction.second.getLoopExitInstr());
7070     }
7071   }
7072 
7073   // Collect instructions from the original loop that will become trivially dead
7074   // in the vectorized loop. We don't need to vectorize these instructions. For
7075   // example, original induction update instructions can become dead because we
7076   // separately emit induction "steps" when generating code for the new loop.
7077   // Similarly, we create a new latch condition when setting up the structure
7078   // of the new loop, so the old one can become dead.
7079   SmallPtrSet<Instruction *, 4> DeadInstructions;
7080   collectTriviallyDeadInstructions(DeadInstructions);
7081 
7082   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7083     VFRange SubRange = {VF, MaxVF + 1};
7084     VPlans.push_back(
7085         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
7086     VF = SubRange.End;
7087   }
7088 }
7089 
7090 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7091     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7092     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7093 
7094   // Hold a mapping from predicated instructions to their recipes, in order to
7095   // fix their AlsoPack behavior if a user is determined to replicate and use a
7096   // scalar instead of a vector value.
7097   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7098 
7099   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7100 
7101   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7102 
7103   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7104 
7105   // ---------------------------------------------------------------------------
7106   // Pre-construction: record ingredients whose recipes we'll need to further
7107   // process after constructing the initial VPlan.
7108   // ---------------------------------------------------------------------------
7109 
7110   // Mark instructions we'll need to sink later and their targets as
7111   // ingredients whose recipe we'll need to record.
7112   for (auto &Entry : SinkAfter) {
7113     RecipeBuilder.recordRecipeOf(Entry.first);
7114     RecipeBuilder.recordRecipeOf(Entry.second);
7115   }
7116 
7117   // For each interleave group which is relevant for this (possibly trimmed)
7118   // Range, add it to the set of groups to be later applied to the VPlan and add
7119   // placeholders for its members' Recipes which we'll be replacing with a
7120   // single VPInterleaveRecipe.
7121   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7122     auto applyIG = [IG, this](unsigned VF) -> bool {
7123       return (VF >= 2 && // Query is illegal for VF == 1
7124               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7125                   LoopVectorizationCostModel::CM_Interleave);
7126     };
7127     if (!getDecisionAndClampRange(applyIG, Range))
7128       continue;
7129     InterleaveGroups.insert(IG);
7130     for (unsigned i = 0; i < IG->getFactor(); i++)
7131       if (Instruction *Member = IG->getMember(i))
7132         RecipeBuilder.recordRecipeOf(Member);
7133   }
7134 
7135   // ---------------------------------------------------------------------------
7136   // Build initial VPlan: Scan the body of the loop in a topological order to
7137   // visit each basic block after having visited its predecessor basic blocks.
7138   // ---------------------------------------------------------------------------
7139 
7140   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7141   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7142   auto Plan = std::make_unique<VPlan>(VPBB);
7143 
7144   // Represent values that will have defs inside VPlan.
7145   for (Value *V : NeedDef)
7146     Plan->addVPValue(V);
7147 
7148   // Scan the body of the loop in a topological order to visit each basic block
7149   // after having visited its predecessor basic blocks.
7150   LoopBlocksDFS DFS(OrigLoop);
7151   DFS.perform(LI);
7152 
7153   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7154     // Relevant instructions from basic block BB will be grouped into VPRecipe
7155     // ingredients and fill a new VPBasicBlock.
7156     unsigned VPBBsForBB = 0;
7157     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7158     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7159     VPBB = FirstVPBBForBB;
7160     Builder.setInsertPoint(VPBB);
7161 
7162     // Introduce each ingredient into VPlan.
7163     for (Instruction &I : BB->instructionsWithoutDebug()) {
7164       Instruction *Instr = &I;
7165 
7166       // First filter out irrelevant instructions, to ensure no recipes are
7167       // built for them.
7168       if (isa<BranchInst>(Instr) ||
7169           DeadInstructions.find(Instr) != DeadInstructions.end())
7170         continue;
7171 
7172       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7173         continue;
7174 
7175       // Otherwise, if all widening options failed, Instruction is to be
7176       // replicated. This may create a successor for VPBB.
7177       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7178           Instr, Range, VPBB, PredInst2Recipe, Plan);
7179       if (NextVPBB != VPBB) {
7180         VPBB = NextVPBB;
7181         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7182                                     : "");
7183       }
7184     }
7185   }
7186 
7187   // Discard the empty dummy pre-entry VPBasicBlock. Note that other
7188   // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
7189   // original basic blocks with no recipes.
7190   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7191   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7192   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7193   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7194   delete PreEntry;
7195 
7196   // ---------------------------------------------------------------------------
7197   // Transform initial VPlan: Apply previously taken decisions, in order, to
7198   // bring the VPlan to its final state.
7199   // ---------------------------------------------------------------------------
7200 
7201   // Apply Sink-After legal constraints.
7202   for (auto &Entry : SinkAfter) {
7203     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7204     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7205     Sink->moveAfter(Target);
7206   }
7207 
7208   // Interleave memory: for each Interleave Group we marked earlier as relevant
7209   // for this VPlan, replace the Recipes widening its memory instructions with a
7210   // single VPInterleaveRecipe at its insertion point.
7211   for (auto IG : InterleaveGroups) {
7212     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7213         RecipeBuilder.getRecipe(IG->getInsertPos()));
7214     (new VPInterleaveRecipe(IG, Recipe->getMask()))->insertBefore(Recipe);
7215 
7216     for (unsigned i = 0; i < IG->getFactor(); ++i)
7217       if (Instruction *Member = IG->getMember(i)) {
7218         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7219       }
7220   }
7221 
7222   // Finally, if tail is folded by masking, introduce selects between the phi
7223   // and the live-out instruction of each reduction, at the end of the latch.
7224   if (CM.foldTailByMasking()) {
7225     Builder.setInsertPoint(VPBB);
7226     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7227     for (auto &Reduction : *Legal->getReductionVars()) {
7228       VPValue *Phi = Plan->getVPValue(Reduction.first);
7229       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7230       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7231     }
7232   }
7233 
7234   std::string PlanName;
7235   raw_string_ostream RSO(PlanName);
7236   unsigned VF = Range.Start;
7237   Plan->addVF(VF);
7238   RSO << "Initial VPlan for VF={" << VF;
7239   for (VF *= 2; VF < Range.End; VF *= 2) {
7240     Plan->addVF(VF);
7241     RSO << "," << VF;
7242   }
7243   RSO << "},UF>=1";
7244   RSO.flush();
7245   Plan->setName(PlanName);
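       // The name lists every VF this plan covers, e.g.
       // "Initial VPlan for VF={2,4},UF>=1" when Range spans VFs 2 and 4.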
7246 
7247   return Plan;
7248 }
7249 
7250 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7251   // Outer loop handling: outer loops may require CFG and instruction-level
7252   // transformations before we can even evaluate whether vectorization is
7253   // profitable. Since we cannot modify the incoming IR, we need to build
7254   // VPlan upfront in the vectorization pipeline.
7255   assert(!OrigLoop->empty());
7256   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7257 
7258   // Create new empty VPlan
7259   auto Plan = std::make_unique<VPlan>();
7260 
7261   // Build hierarchical CFG
7262   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7263   HCFGBuilder.buildHierarchicalCFG();
7264 
7265   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7266     Plan->addVF(VF);
7267 
7268   if (EnableVPlanPredication) {
7269     VPlanPredicator VPP(*Plan);
7270     VPP.predicate();
7271 
7272     // Avoid running transformation to recipes until masked code generation in
7273     // VPlan-native path is in place.
7274     return Plan;
7275   }
7276 
7277   SmallPtrSet<Instruction *, 1> DeadInstructions;
7278   VPlanTransforms::VPInstructionsToVPRecipes(
7279       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7280   return Plan;
7281 }
7282 
7283 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
7284     Value *V, unsigned Part) {
7285   return ILV.getOrCreateVectorValue(V, Part);
7286 }
7287 
7288 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7289   O << " +\n"
7290     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7291   IG->getInsertPos()->printAsOperand(O, false);
7292   if (User) {
7293     O << ", ";
7294     User->getOperand(0)->printAsOperand(O);
7295   }
7296   O << "\\l\"";
7297   for (unsigned i = 0; i < IG->getFactor(); ++i)
7298     if (Instruction *I = IG->getMember(i))
7299       O << " +\n"
7300         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7301 }
7302 
7303 void VPWidenRecipe::execute(VPTransformState &State) {
7304   for (auto &Instr : make_range(Begin, End))
7305     State.ILV->widenInstruction(Instr);
7306 }
7307 
7308 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7309   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7310                       IsIndexLoopInvariant);
7311 }
7312 
7313 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7314   assert(!State.Instance && "Int or FP induction being replicated.");
7315   State.ILV->widenIntOrFpInduction(IV, Trunc);
7316 }
7317 
7318 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7319   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7320 }
7321 
7322 void VPBlendRecipe::execute(VPTransformState &State) {
7323   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7324   // We know that all PHIs in non-header blocks are converted into
7325   // selects, so we don't have to worry about the insertion order and we
7326   // can just use the builder.
7327   // At this point we generate the predication tree. There may be
7328   // duplications since this is a simple recursive scan, but future
7329   // optimizations will clean it up.
7330 
7331   unsigned NumIncoming = Phi->getNumIncomingValues();
7332 
7333   assert((User || NumIncoming == 1) &&
7334          "Multiple predecessors with one having a full mask");
7335   // Generate a sequence of selects of the form:
7336   // SELECT(Mask3, In3,
7337   //      SELECT(Mask2, In2,
7338   //                   ( ...)))
7339   InnerLoopVectorizer::VectorParts Entry(State.UF);
7340   for (unsigned In = 0; In < NumIncoming; ++In) {
7341     for (unsigned Part = 0; Part < State.UF; ++Part) {
7342       // We might have single edge PHIs (blocks) - use an identity
7343       // 'select' for the first PHI operand.
7344       Value *In0 =
7345           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7346       if (In == 0)
7347         Entry[Part] = In0; // Initialize with the first incoming value.
7348       else {
7349         // Select between the current value and the previous incoming edge
7350         // based on the incoming mask.
7351         Value *Cond = State.get(User->getOperand(In), Part);
7352         Entry[Part] =
7353             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7354       }
7355     }
7356   }
7357   for (unsigned Part = 0; Part < State.UF; ++Part)
7358     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7359 }
7360 
7361 void VPInterleaveRecipe::execute(VPTransformState &State) {
7362   assert(!State.Instance && "Interleave group being replicated.");
7363   if (!User)
7364     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7365 
  // The last (and currently only) operand is the mask.
7367   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7368   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7369   for (unsigned Part = 0; Part < State.UF; ++Part)
7370     MaskValues[Part] = State.get(Mask, Part);
7371   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7372 }
7373 
7374 void VPReplicateRecipe::execute(VPTransformState &State) {
7375   if (State.Instance) { // Generate a single instance.
7376     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert the scalar instance, packing it into a vector.
7378     if (AlsoPack && State.VF > 1) {
7379       // If we're constructing lane 0, initialize to start from undef.
7380       if (State.Instance->Lane == 0) {
7381         Value *Undef =
7382             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7383         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7384       }
7385       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7386     }
7387     return;
7388   }
7389 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane of each
  // UF part.
7393   unsigned EndLane = IsUniform ? 1 : State.VF;
7394   for (unsigned Part = 0; Part < State.UF; ++Part)
7395     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7396       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7397 }
7398 
7399 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7400   assert(State.Instance && "Branch on Mask works only on single instance.");
7401 
7402   unsigned Part = State.Instance->Part;
7403   unsigned Lane = State.Instance->Lane;
7404 
7405   Value *ConditionBit = nullptr;
7406   if (!User) // Block in mask is all-one.
7407     ConditionBit = State.Builder.getTrue();
7408   else {
7409     VPValue *BlockInMask = User->getOperand(0);
7410     ConditionBit = State.get(BlockInMask, Part);
7411     if (ConditionBit->getType()->isVectorTy())
7412       ConditionBit = State.Builder.CreateExtractElement(
7413           ConditionBit, State.Builder.getInt32(Lane));
7414   }
7415 
7416   // Replace the temporary unreachable terminator with a new conditional branch,
7417   // whose two destinations will be set later when they are created.
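  // The branch is created with PrevBB as a placeholder first successor, which
  // is reset to null immediately below.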
7418   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7419   assert(isa<UnreachableInst>(CurrentTerminator) &&
7420          "Expected to replace unreachable terminator with conditional branch.");
7421   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7422   CondBr->setSuccessor(0, nullptr);
7423   ReplaceInstWithInst(CurrentTerminator, CondBr);
7424 }
7425 
7426 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7427   assert(State.Instance && "Predicated instruction PHI works per instance.");
7428   Instruction *ScalarPredInst = cast<Instruction>(
7429       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7430   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7431   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7432   assert(PredicatingBB && "Predicated block has no single predecessor.");
7433 
7434   // By current pack/unpack logic we need to generate only a single phi node: if
7435   // a vector value for the predicated instruction exists at this point it means
7436   // the instruction has vector users only, and a phi for the vector value is
7437   // needed. In this case the recipe of the predicated instruction is marked to
7438   // also do that packing, thereby "hoisting" the insert-element sequence.
7439   // Otherwise, a phi node for the scalar value is needed.
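  // For illustration only (names are hypothetical): in the vector case the
  // generated phi merges the unmodified vector coming from the predicating
  // block with the vector holding the newly inserted scalar, e.g.:
  //   if.then:
  //     %d = udiv i32 %x, %y
  //     %v.new = insertelement <4 x i32> %v.old, i32 %d, i32 0
  //   if.end:
  //     %v = phi <4 x i32> [ %v.old, %if.cond ], [ %v.new, %if.then ]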
7440   unsigned Part = State.Instance->Part;
7441   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7442     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7443     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7444     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7445     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7446     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7447     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7448   } else {
7449     Type *PredInstType = PredInst->getType();
7450     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7451     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7452     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7453     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7454   }
7455 }
7456 
7457 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7458   VPValue *Mask = getMask();
7459   if (!Mask)
7460     return State.ILV->vectorizeMemoryInstruction(&Instr);
7461 
7462   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7463   for (unsigned Part = 0; Part < State.UF; ++Part)
7464     MaskValues[Part] = State.get(Mask, Part);
7465   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7466 }
7467 
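// Determine how the scalar epilogue for this loop should be handled: it is
// allowed by default, disallowed when optimizing for size (unless
// vectorization is explicitly forced), and dropped in favor of a predicated
// (tail-folded) vector body when predication is requested on the command
// line, via loop hints, or by the target, and has not been explicitly
// disabled.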
7468 static ScalarEpilogueLowering
7469 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7470                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
7471                           TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7472                           AssumptionCache *AC, LoopInfo *LI,
7473                           ScalarEvolution *SE, DominatorTree *DT,
7474                           const LoopAccessInfo *LAI) {
7475   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7476   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7477                               !PreferPredicateOverEpilog;
7478 
7479   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7480       (F->hasOptSize() ||
7481        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7482                                    PGSOQueryType::IRPass)))
7483     SEL = CM_ScalarEpilogueNotAllowedOptSize;
7484   else if (PreferPredicateOverEpilog ||
7485            Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7486            (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) &&
7487             Hints.getPredicate() != LoopVectorizeHints::FK_Disabled &&
7488             !PredicateOptDisabled))
7489     SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7490 
7491   return SEL;
7492 }
7493 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which makes it possible to
// apply VPlan-to-VPlan transformations from the very beginning, without
// modifying the input LLVM IR.
7498 static bool processLoopInVPlanNativePath(
7499     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7500     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7501     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7502     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7503     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7504 
7505   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7506   Function *F = L->getHeader()->getParent();
7507   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7508 
7509   ScalarEpilogueLowering SEL =
7510     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
7511                               PSE.getSE(), DT, LVL->getLAI());
7512 
7513   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7514                                 &Hints, IAI);
7515   // Use the planner for outer loop vectorization.
7516   // TODO: CM is not used at this point inside the planner. Turn CM into an
7517   // optional argument if we don't need it in the future.
7518   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7519 
7520   // Get user vectorization factor.
7521   const unsigned UserVF = Hints.getWidth();
7522 
7523   // Plan how to best vectorize, return the best VF and its cost.
7524   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7525 
7526   // If we are stress testing VPlan builds, do not attempt to generate vector
7527   // code. Masked vector code generation support will follow soon.
7528   // Also, do not attempt to vectorize if no vector code will be produced.
7529   if (VPlanBuildStressTest || EnableVPlanPredication ||
7530       VectorizationFactor::Disabled() == VF)
7531     return false;
7532 
7533   LVP.setBestPlan(VF.Width, 1);
7534 
7535   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7536                          &CM);
7537   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7538                     << L->getHeader()->getParent()->getName() << "\"\n");
7539   LVP.executePlan(LB, DT);
7540 
7541   // Mark the loop as already vectorized to avoid vectorizing again.
7542   Hints.setAlreadyVectorized();
7543 
7544   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7545   return true;
7546 }
7547 
7548 bool LoopVectorizePass::processLoop(Loop *L) {
7549   assert((EnableVPlanNativePath || L->empty()) &&
7550          "VPlan-native path is not enabled. Only process inner loops.");
7551 
7552 #ifndef NDEBUG
7553   const std::string DebugLocStr = getDebugLocString(L);
7554 #endif /* NDEBUG */
7555 
7556   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7557                     << L->getHeader()->getParent()->getName() << "\" from "
7558                     << DebugLocStr << "\n");
7559 
7560   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7561 
7562   LLVM_DEBUG(
7563       dbgs() << "LV: Loop hints:"
7564              << " force="
7565              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7566                      ? "disabled"
7567                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7568                             ? "enabled"
7569                             : "?"))
7570              << " width=" << Hints.getWidth()
7571              << " unroll=" << Hints.getInterleave() << "\n");
7572 
  // Function containing the loop.
7574   Function *F = L->getHeader()->getParent();
7575 
7576   // Looking at the diagnostic output is the only way to determine if a loop
7577   // was vectorized (other than looking at the IR or machine code), so it
7578   // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose and report, respectively, vectorized loops and unvectorized loops
  // that may benefit from vectorization.
7583 
7584   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7585     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7586     return false;
7587   }
7588 
7589   PredicatedScalarEvolution PSE(*SE, *L);
7590 
7591   // Check if it is legal to vectorize the loop.
7592   LoopVectorizationRequirements Requirements(*ORE);
7593   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7594                                 &Requirements, &Hints, DB, AC);
7595   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7596     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7597     Hints.emitRemarkWithHints();
7598     return false;
7599   }
7600 
7601   // Check the function attributes and profiles to find out if this function
7602   // should be optimized for size.
7603   ScalarEpilogueLowering SEL =
7604     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
7605                               PSE.getSE(), DT, LVL.getLAI());
7606 
7607   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7608   // here. They may require CFG and instruction level transformations before
7609   // even evaluating whether vectorization is profitable. Since we cannot modify
7610   // the incoming IR, we need to build VPlan upfront in the vectorization
7611   // pipeline.
7612   if (!L->empty())
7613     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7614                                         ORE, BFI, PSI, Hints);
7615 
7616   assert(L->empty() && "Inner loop expected.");
7617 
7618   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7619   // count by optimizing for size, to minimize overheads.
7620   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7621   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7622     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7623                       << "This loop is worth vectorizing only if no scalar "
7624                       << "iteration overheads are incurred.");
7625     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7626       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7627     else {
7628       LLVM_DEBUG(dbgs() << "\n");
7629       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7630     }
7631   }
7632 
  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem like it can be correct -- what if the loop
  // is an integer loop and the vector instructions selected are purely integer
  // vector instructions?
7637   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7638     reportVectorizationFailure(
7639         "Can't vectorize when the NoImplicitFloat attribute is used",
7640         "loop not vectorized due to NoImplicitFloat attribute",
7641         "NoImplicitFloat", ORE, L);
7642     Hints.emitRemarkWithHints();
7643     return false;
7644   }
7645 
7646   // Check if the target supports potentially unsafe FP vectorization.
7647   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7648   // for the target we're vectorizing for, to make sure none of the
7649   // additional fp-math flags can help.
7650   if (Hints.isPotentiallyUnsafe() &&
7651       TTI->isFPVectorizationPotentiallyUnsafe()) {
7652     reportVectorizationFailure(
7653         "Potentially unsafe FP op prevents vectorization",
7654         "loop not vectorized due to unsafe FP support.",
7655         "UnsafeFP", ORE, L);
7656     Hints.emitRemarkWithHints();
7657     return false;
7658   }
7659 
7660   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7661   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7662 
7663   // If an override option has been passed in for interleaved accesses, use it.
7664   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7665     UseInterleaved = EnableInterleavedMemAccesses;
7666 
7667   // Analyze interleaved memory accesses.
7668   if (UseInterleaved) {
7669     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7670   }
7671 
7672   // Use the cost model.
7673   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7674                                 F, &Hints, IAI);
7675   CM.collectValuesToIgnore();
7676 
7677   // Use the planner for vectorization.
7678   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7679 
7680   // Get user vectorization factor.
7681   unsigned UserVF = Hints.getWidth();
7682 
7683   // Plan how to best vectorize, return the best VF and its cost.
7684   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7685 
7686   VectorizationFactor VF = VectorizationFactor::Disabled();
7687   unsigned IC = 1;
7688   unsigned UserIC = Hints.getInterleave();
7689 
7690   if (MaybeVF) {
7691     VF = *MaybeVF;
7692     // Select the interleave count.
7693     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7694   }
7695 
7696   // Identify the diagnostic messages that should be produced.
7697   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7698   bool VectorizeLoop = true, InterleaveLoop = true;
7699   if (Requirements.doesNotMeet(F, L, Hints)) {
7700     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7701                          "requirements.\n");
7702     Hints.emitRemarkWithHints();
7703     return false;
7704   }
7705 
7706   if (VF.Width == 1) {
7707     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7708     VecDiagMsg = std::make_pair(
7709         "VectorizationNotBeneficial",
7710         "the cost-model indicates that vectorization is not beneficial");
7711     VectorizeLoop = false;
7712   }
7713 
7714   if (!MaybeVF && UserIC > 1) {
7715     // Tell the user interleaving was avoided up-front, despite being explicitly
7716     // requested.
7717     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7718                          "interleaving should be avoided up front\n");
7719     IntDiagMsg = std::make_pair(
7720         "InterleavingAvoided",
7721         "Ignoring UserIC, because interleaving was avoided up front");
7722     InterleaveLoop = false;
7723   } else if (IC == 1 && UserIC <= 1) {
7724     // Tell the user interleaving is not beneficial.
7725     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7726     IntDiagMsg = std::make_pair(
7727         "InterleavingNotBeneficial",
7728         "the cost-model indicates that interleaving is not beneficial");
7729     InterleaveLoop = false;
7730     if (UserIC == 1) {
7731       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7732       IntDiagMsg.second +=
7733           " and is explicitly disabled or interleave count is set to 1";
7734     }
7735   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but is explicitly disabled.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
                         "disabled.\n");
7739     IntDiagMsg = std::make_pair(
7740         "InterleavingBeneficialButDisabled",
7741         "the cost-model indicates that interleaving is beneficial "
7742         "but is explicitly disabled or interleave count is set to 1");
7743     InterleaveLoop = false;
7744   }
7745 
7746   // Override IC if user provided an interleave count.
7747   IC = UserIC > 0 ? UserIC : IC;
7748 
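  // Exactly one of four (VectorizeLoop, InterleaveLoop) combinations holds
  // here: if both are false, report why and bail out; otherwise report the
  // decision and transform the loop below.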
7749   // Emit diagnostic messages, if any.
7750   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7751   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7753     ORE->emit([&]() {
7754       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7755                                       L->getStartLoc(), L->getHeader())
7756              << VecDiagMsg.second;
7757     });
7758     ORE->emit([&]() {
7759       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7760                                       L->getStartLoc(), L->getHeader())
7761              << IntDiagMsg.second;
7762     });
7763     return false;
7764   } else if (!VectorizeLoop && InterleaveLoop) {
7765     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7766     ORE->emit([&]() {
7767       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7768                                         L->getStartLoc(), L->getHeader())
7769              << VecDiagMsg.second;
7770     });
7771   } else if (VectorizeLoop && !InterleaveLoop) {
7772     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7773                       << ") in " << DebugLocStr << '\n');
7774     ORE->emit([&]() {
7775       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7776                                         L->getStartLoc(), L->getHeader())
7777              << IntDiagMsg.second;
7778     });
7779   } else if (VectorizeLoop && InterleaveLoop) {
7780     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7781                       << ") in " << DebugLocStr << '\n');
7782     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7783   }
7784 
7785   LVP.setBestPlan(VF.Width, IC);
7786 
7787   using namespace ore;
7788   bool DisableRuntimeUnroll = false;
7789   MDNode *OrigLoopID = L->getLoopID();
7790 
7791   if (!VectorizeLoop) {
7792     assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided not to vectorize the loop (e.g., because it is not
    // beneficial), then interleave it.
7795     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7796                                &CM);
7797     LVP.executePlan(Unroller, DT);
7798 
7799     ORE->emit([&]() {
7800       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7801                                 L->getHeader())
7802              << "interleaved loop (interleaved count: "
7803              << NV("InterleaveCount", IC) << ")";
7804     });
7805   } else {
    // If we decided that it is both legal and beneficial to vectorize the
    // loop, then do it.
7807     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7808                            &LVL, &CM);
7809     LVP.executePlan(LB, DT);
7810     ++LoopsVectorized;
7811 
7812     // Add metadata to disable runtime unrolling a scalar loop when there are
7813     // no runtime checks about strides and memory. A scalar loop that is
7814     // rarely used is not worth unrolling.
7815     if (!LB.areSafetyChecksAdded())
7816       DisableRuntimeUnroll = true;
7817 
7818     // Report the vectorization decision.
7819     ORE->emit([&]() {
7820       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7821                                 L->getHeader())
7822              << "vectorized loop (vectorization width: "
7823              << NV("VectorizationFactor", VF.Width)
7824              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7825     });
7826   }
7827 
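  // Propagate any user-provided follow-up loop metadata (identified by
  // LLVMLoopVectorizeFollowupAll / LLVMLoopVectorizeFollowupEpilogue) to the
  // remainder loop; if none is present, attach the default markers below.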
7828   Optional<MDNode *> RemainderLoopID =
7829       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7830                                       LLVMLoopVectorizeFollowupEpilogue});
7831   if (RemainderLoopID.hasValue()) {
7832     L->setLoopID(RemainderLoopID.getValue());
7833   } else {
7834     if (DisableRuntimeUnroll)
7835       AddRuntimeUnrollDisableMetaData(L);
7836 
7837     // Mark the loop as already vectorized to avoid vectorizing again.
7838     Hints.setAlreadyVectorized();
7839   }
7840 
7841   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7842   return true;
7843 }
7844 
7845 bool LoopVectorizePass::runImpl(
7846     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7847     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7848     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7849     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7850     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7851   SE = &SE_;
7852   LI = &LI_;
7853   TTI = &TTI_;
7854   DT = &DT_;
7855   BFI = &BFI_;
7856   TLI = TLI_;
7857   AA = &AA_;
7858   AC = &AC_;
7859   GetLAA = &GetLAA_;
7860   DB = &DB_;
7861   ORE = &ORE_;
7862   PSI = PSI_;
7863 
7864   // Don't attempt if
7865   // 1. the target claims to have no vector registers, and
7866   // 2. interleaving won't help ILP.
7867   //
7868   // The second condition is necessary because, even if the target has no
7869   // vector registers, loop vectorization may still enable scalar
7870   // interleaving.
7871   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7872       TTI->getMaxInterleaveFactor(1) < 2)
7873     return false;
7874 
7875   bool Changed = false;
7876 
7877   // The vectorizer requires loops to be in simplified form.
7878   // Since simplification may add new inner loops, it has to run before the
7879   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7881   // vectorized.
7882   for (auto &L : *LI)
7883     Changed |=
7884         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7885 
7886   // Build up a worklist of inner-loops to vectorize. This is necessary as
7887   // the act of vectorizing or partially unrolling a loop creates new loops
7888   // and can invalidate iterators across the loops.
7889   SmallVector<Loop *, 8> Worklist;
7890 
7891   for (Loop *L : *LI)
7892     collectSupportedLoops(*L, LI, ORE, Worklist);
7893 
7894   LoopsAnalyzed += Worklist.size();
7895 
7896   // Now walk the identified inner loops.
7897   while (!Worklist.empty()) {
7898     Loop *L = Worklist.pop_back_val();
7899 
7900     // For the inner loops we actually process, form LCSSA to simplify the
7901     // transform.
7902     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7903 
7904     Changed |= processLoop(L);
7905   }
7906 
7907   // Process each loop nest in the function.
7908   return Changed;
7909 }
7910 
7911 PreservedAnalyses LoopVectorizePass::run(Function &F,
7912                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
7954 }
7955