1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Indicates that an epilogue is undesired, predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
192              "will be determined by the smallest type in loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with a estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
281 // VPlan-native vectorization path. It must be used in conjuction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function that returns the type of loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = VectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors, or multiple
379 /// scalars. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
388 /// LoopVectorizationLegality class to provide information about the induction
389 /// and reduction variables that were found to a given vectorization factor.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM)
399       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
400         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
401         Builder(PSE.getSE()->getContext()),
402         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
403   virtual ~InnerLoopVectorizer() = default;
404 
405   /// Create a new empty loop. Unlink the old loop and connect the new one.
406   /// Return the pre-header block of the new loop.
407   BasicBlock *createVectorizedLoopSkeleton();
408 
409   /// Widen a single instruction within the innermost loop.
410   void widenInstruction(Instruction &I);
411 
412   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
413   void fixVectorizedLoop();
414 
415   // Return true if any runtime check is added.
416   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
417 
418   /// A type for vectorized values in the new loop. Each value from the
419   /// original loop, when vectorized, is represented by UF vector values in the
420   /// new unrolled loop, where UF is the unroll factor.
421   using VectorParts = SmallVector<Value *, 2>;
422 
423   /// Vectorize a single GetElementPtrInst based on information gathered and
424   /// decisions taken during planning.
425   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
426                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
427 
428   /// Vectorize a single PHINode in a block. This method handles the induction
429   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
430   /// arbitrary length vectors.
431   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
432 
433   /// A helper function to scalarize a single Instruction in the innermost loop.
434   /// Generates a sequence of scalar instances for each lane between \p MinLane
435   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
436   /// inclusive..
437   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
438                             bool IfPredicateInstr);
439 
440   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
441   /// is provided, the integer induction variable will first be truncated to
442   /// the corresponding type.
443   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
444 
445   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
446   /// vector or scalar value on-demand if one is not yet available. When
447   /// vectorizing a loop, we visit the definition of an instruction before its
448   /// uses. When visiting the definition, we either vectorize or scalarize the
449   /// instruction, creating an entry for it in the corresponding map. (In some
450   /// cases, such as induction variables, we will create both vector and scalar
451   /// entries.) Then, as we encounter uses of the definition, we derive values
452   /// for each scalar or vector use unless such a value is already available.
453   /// For example, if we scalarize a definition and one of its uses is vector,
454   /// we build the required vector on-demand with an insertelement sequence
455   /// when visiting the use. Otherwise, if the use is scalar, we can use the
456   /// existing scalar definition.
457   ///
458   /// Return a value in the new loop corresponding to \p V from the original
459   /// loop at unroll index \p Part. If the value has already been vectorized,
460   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
461   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
462   /// a new vector value on-demand by inserting the scalar values into a vector
463   /// with an insertelement sequence. If the value has been neither vectorized
464   /// nor scalarized, it must be loop invariant, so we simply broadcast the
465   /// value into a vector.
466   Value *getOrCreateVectorValue(Value *V, unsigned Part);
467 
468   /// Return a value in the new loop corresponding to \p V from the original
469   /// loop at unroll and vector indices \p Instance. If the value has been
470   /// vectorized but not scalarized, the necessary extractelement instruction
471   /// will be generated.
472   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
473 
474   /// Construct the vector value of a scalarized value \p V one lane at a time.
475   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
476 
477   /// Try to vectorize the interleaved access group that \p Instr belongs to
478   /// with the base address given in \p Addr, optionally masking the vector
479   /// operations if \p BlockInMask is non-null. Use \p State to translate given
480   /// VPValues to IR values in the vectorized loop.
481   void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
482                                 VPValue *Addr, VPValue *BlockInMask = nullptr);
483 
484   /// Vectorize Load and Store instructions with the base address given in \p
485   /// Addr, optionally masking the vector operations if \p BlockInMask is
486   /// non-null. Use \p State to translate given VPValues to IR values in the
487   /// vectorized loop.
488   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
489                                   VPValue *Addr,
490                                   VPValue *BlockInMask = nullptr);
491 
492   /// Set the debug location in the builder using the debug location in
493   /// the instruction.
494   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
495 
496   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
497   void fixNonInductionPHIs(void);
498 
499 protected:
500   friend class LoopVectorizationPlanner;
501 
502   /// A small list of PHINodes.
503   using PhiVector = SmallVector<PHINode *, 4>;
504 
505   /// A type for scalarized values in the new loop. Each value from the
506   /// original loop, when scalarized, is represented by UF x VF scalar values
507   /// in the new unrolled loop, where UF is the unroll factor and VF is the
508   /// vectorization factor.
509   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
510 
511   /// Set up the values of the IVs correctly when exiting the vector loop.
512   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
513                     Value *CountRoundDown, Value *EndValue,
514                     BasicBlock *MiddleBlock);
515 
516   /// Create a new induction variable inside L.
517   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
518                                    Value *Step, Instruction *DL);
519 
520   /// Handle all cross-iteration phis in the header.
521   void fixCrossIterationPHIs();
522 
523   /// Fix a first-order recurrence. This is the second phase of vectorizing
524   /// this phi node.
525   void fixFirstOrderRecurrence(PHINode *Phi);
526 
527   /// Fix a reduction cross-iteration phi. This is the second phase of
528   /// vectorizing this phi node.
529   void fixReduction(PHINode *Phi);
530 
531   /// Clear NSW/NUW flags from reduction instructions if necessary.
532   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
533 
534   /// The Loop exit block may have single value PHI nodes with some
535   /// incoming value. While vectorizing we only handled real values
536   /// that were defined inside the loop and we should have one value for
537   /// each predecessor of its parent basic block. See PR14725.
538   void fixLCSSAPHIs();
539 
540   /// Iteratively sink the scalarized operands of a predicated instruction into
541   /// the block that was created for it.
542   void sinkScalarOperands(Instruction *PredInst);
543 
544   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
545   /// represented as.
546   void truncateToMinimalBitwidths();
547 
548   /// Create a broadcast instruction. This method generates a broadcast
549   /// instruction (shuffle) for loop invariant values and for the induction
550   /// value. If this is the induction variable then we extend it to N, N+1, ...
551   /// this is needed because each iteration in the loop corresponds to a SIMD
552   /// element.
553   virtual Value *getBroadcastInstrs(Value *V);
554 
555   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
556   /// to each vector element of Val. The sequence starts at StartIndex.
557   /// \p Opcode is relevant for FP induction variable.
558   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
559                                Instruction::BinaryOps Opcode =
560                                Instruction::BinaryOpsEnd);
561 
562   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
563   /// variable on which to base the steps, \p Step is the size of the step, and
564   /// \p EntryVal is the value from the original loop that maps to the steps.
565   /// Note that \p EntryVal doesn't have to be an induction variable - it
566   /// can also be a truncate instruction.
567   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
568                         const InductionDescriptor &ID);
569 
570   /// Create a vector induction phi node based on an existing scalar one. \p
571   /// EntryVal is the value from the original loop that maps to the vector phi
572   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
573   /// truncate instruction, instead of widening the original IV, we widen a
574   /// version of the IV truncated to \p EntryVal's type.
575   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
576                                        Value *Step, Instruction *EntryVal);
577 
578   /// Returns true if an instruction \p I should be scalarized instead of
579   /// vectorized for the chosen vectorization factor.
580   bool shouldScalarizeInstruction(Instruction *I) const;
581 
582   /// Returns true if we should generate a scalar version of \p IV.
583   bool needsScalarInduction(Instruction *IV) const;
584 
585   /// If there is a cast involved in the induction variable \p ID, which should
586   /// be ignored in the vectorized loop body, this function records the
587   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
588   /// cast. We had already proved that the casted Phi is equal to the uncasted
589   /// Phi in the vectorized loop (under a runtime guard), and therefore
590   /// there is no need to vectorize the cast - the same value can be used in the
591   /// vector loop for both the Phi and the cast.
592   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
593   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
594   ///
595   /// \p EntryVal is the value from the original loop that maps to the vector
596   /// phi node and is used to distinguish what is the IV currently being
597   /// processed - original one (if \p EntryVal is a phi corresponding to the
598   /// original IV) or the "newly-created" one based on the proof mentioned above
599   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
600   /// latter case \p EntryVal is a TruncInst and we must not record anything for
601   /// that IV, but it's error-prone to expect callers of this routine to care
602   /// about that, hence this explicit parameter.
603   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
604                                              const Instruction *EntryVal,
605                                              Value *VectorLoopValue,
606                                              unsigned Part,
607                                              unsigned Lane = UINT_MAX);
608 
609   /// Generate a shuffle sequence that will reverse the vector Vec.
610   virtual Value *reverseVector(Value *Vec);
611 
612   /// Returns (and creates if needed) the original loop trip count.
613   Value *getOrCreateTripCount(Loop *NewLoop);
614 
615   /// Returns (and creates if needed) the trip count of the widened loop.
616   Value *getOrCreateVectorTripCount(Loop *NewLoop);
617 
618   /// Returns a bitcasted value to the requested vector type.
619   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
620   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
621                                 const DataLayout &DL);
622 
623   /// Emit a bypass check to see if the vector trip count is zero, including if
624   /// it overflows.
625   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
626 
627   /// Emit a bypass check to see if all of the SCEV assumptions we've
628   /// had to make are correct.
629   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
630 
631   /// Emit bypass checks to check any memory assumptions we may have made.
632   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
633 
634   /// Compute the transformed value of Index at offset StartValue using step
635   /// StepValue.
636   /// For integer induction, returns StartValue + Index * StepValue.
637   /// For pointer induction, returns StartValue[Index * StepValue].
638   /// FIXME: The newly created binary instructions should contain nsw/nuw
639   /// flags, which can be found from the original scalar operations.
640   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
641                               const DataLayout &DL,
642                               const InductionDescriptor &ID) const;
643 
644   /// Add additional metadata to \p To that was not present on \p Orig.
645   ///
646   /// Currently this is used to add the noalias annotations based on the
647   /// inserted memchecks.  Use this for instructions that are *cloned* into the
648   /// vector loop.
649   void addNewMetadata(Instruction *To, const Instruction *Orig);
650 
651   /// Add metadata from one instruction to another.
652   ///
653   /// This includes both the original MDs from \p From and additional ones (\see
654   /// addNewMetadata).  Use this for *newly created* instructions in the vector
655   /// loop.
656   void addMetadata(Instruction *To, Instruction *From);
657 
658   /// Similar to the previous function but it adds the metadata to a
659   /// vector of instructions.
660   void addMetadata(ArrayRef<Value *> To, Instruction *From);
661 
662   /// The original loop.
663   Loop *OrigLoop;
664 
665   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
666   /// dynamic knowledge to simplify SCEV expressions and converts them to a
667   /// more usable form.
668   PredicatedScalarEvolution &PSE;
669 
670   /// Loop Info.
671   LoopInfo *LI;
672 
673   /// Dominator Tree.
674   DominatorTree *DT;
675 
676   /// Alias Analysis.
677   AliasAnalysis *AA;
678 
679   /// Target Library Info.
680   const TargetLibraryInfo *TLI;
681 
682   /// Target Transform Info.
683   const TargetTransformInfo *TTI;
684 
685   /// Assumption Cache.
686   AssumptionCache *AC;
687 
688   /// Interface to emit optimization remarks.
689   OptimizationRemarkEmitter *ORE;
690 
691   /// LoopVersioning.  It's only set up (non-null) if memchecks were
692   /// used.
693   ///
694   /// This is currently only used to add no-alias metadata based on the
695   /// memchecks.  The actually versioning is performed manually.
696   std::unique_ptr<LoopVersioning> LVer;
697 
698   /// The vectorization SIMD factor to use. Each vector will have this many
699   /// vector elements.
700   unsigned VF;
701 
702   /// The vectorization unroll factor to use. Each scalar is vectorized to this
703   /// many different vector instructions.
704   unsigned UF;
705 
706   /// The builder that we use
707   IRBuilder<> Builder;
708 
709   // --- Vectorization state ---
710 
711   /// The vector-loop preheader.
712   BasicBlock *LoopVectorPreHeader;
713 
714   /// The scalar-loop preheader.
715   BasicBlock *LoopScalarPreHeader;
716 
717   /// Middle Block between the vector and the scalar.
718   BasicBlock *LoopMiddleBlock;
719 
720   /// The ExitBlock of the scalar loop.
721   BasicBlock *LoopExitBlock;
722 
723   /// The vector loop body.
724   BasicBlock *LoopVectorBody;
725 
726   /// The scalar loop body.
727   BasicBlock *LoopScalarBody;
728 
729   /// A list of all bypass blocks. The first block is the entry of the loop.
730   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
731 
732   /// The new Induction variable which was added to the new block.
733   PHINode *Induction = nullptr;
734 
735   /// The induction variable of the old basic block.
736   PHINode *OldInduction = nullptr;
737 
738   /// Maps values from the original loop to their corresponding values in the
739   /// vectorized loop. A key value can map to either vector values, scalar
740   /// values or both kinds of values, depending on whether the key was
741   /// vectorized and scalarized.
742   VectorizerValueMap VectorLoopValueMap;
743 
744   /// Store instructions that were predicated.
745   SmallVector<Instruction *, 4> PredicatedInstructions;
746 
747   /// Trip count of the original loop.
748   Value *TripCount = nullptr;
749 
750   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
751   Value *VectorTripCount = nullptr;
752 
753   /// The legality analysis.
754   LoopVectorizationLegality *Legal;
755 
756   /// The profitablity analysis.
757   LoopVectorizationCostModel *Cost;
758 
759   // Record whether runtime checks are added.
760   bool AddedSafetyChecks = false;
761 
762   // Holds the end values for each induction variable. We save the end values
763   // so we can later fix-up the external users of the induction variables.
764   DenseMap<PHINode *, Value *> IVEndValues;
765 
766   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
767   // fixed up at the end of vector code generation.
768   SmallVector<PHINode *, 8> OrigPHIsToFix;
769 };
770 
771 class InnerLoopUnroller : public InnerLoopVectorizer {
772 public:
773   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
774                     LoopInfo *LI, DominatorTree *DT,
775                     const TargetLibraryInfo *TLI,
776                     const TargetTransformInfo *TTI, AssumptionCache *AC,
777                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
778                     LoopVectorizationLegality *LVL,
779                     LoopVectorizationCostModel *CM)
780       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
781                             UnrollFactor, LVL, CM) {}
782 
783 private:
784   Value *getBroadcastInstrs(Value *V) override;
785   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
786                        Instruction::BinaryOps Opcode =
787                        Instruction::BinaryOpsEnd) override;
788   Value *reverseVector(Value *Vec) override;
789 };
790 
791 } // end namespace llvm
792 
793 /// Look for a meaningful debug location on the instruction or it's
794 /// operands.
795 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
796   if (!I)
797     return I;
798 
799   DebugLoc Empty;
800   if (I->getDebugLoc() != Empty)
801     return I;
802 
803   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
804     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
805       if (OpInst->getDebugLoc() != Empty)
806         return OpInst;
807   }
808 
809   return I;
810 }
811 
812 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
813   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
814     const DILocation *DIL = Inst->getDebugLoc();
815     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
816         !isa<DbgInfoIntrinsic>(Inst)) {
817       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
818       if (NewDIL)
819         B.SetCurrentDebugLocation(NewDIL.getValue());
820       else
821         LLVM_DEBUG(dbgs()
822                    << "Failed to create new discriminator: "
823                    << DIL->getFilename() << " Line: " << DIL->getLine());
824     }
825     else
826       B.SetCurrentDebugLocation(DIL);
827   } else
828     B.SetCurrentDebugLocation(DebugLoc());
829 }
830 
831 /// Write a record \p DebugMsg about vectorization failure to the debug
832 /// output stream. If \p I is passed, it is an instruction that prevents
833 /// vectorization.
834 #ifndef NDEBUG
835 static void debugVectorizationFailure(const StringRef DebugMsg,
836     Instruction *I) {
837   dbgs() << "LV: Not vectorizing: " << DebugMsg;
838   if (I != nullptr)
839     dbgs() << " " << *I;
840   else
841     dbgs() << '.';
842   dbgs() << '\n';
843 }
844 #endif
845 
846 /// Create an analysis remark that explains why vectorization failed
847 ///
848 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
849 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
850 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
851 /// the location of the remark.  \return the remark object that can be
852 /// streamed to.
853 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
854     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
855   Value *CodeRegion = TheLoop->getHeader();
856   DebugLoc DL = TheLoop->getStartLoc();
857 
858   if (I) {
859     CodeRegion = I->getParent();
860     // If there is no debug location attached to the instruction, revert back to
861     // using the loop's.
862     if (I->getDebugLoc())
863       DL = I->getDebugLoc();
864   }
865 
866   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
867   R << "loop not vectorized: ";
868   return R;
869 }
870 
871 namespace llvm {
872 
873 void reportVectorizationFailure(const StringRef DebugMsg,
874     const StringRef OREMsg, const StringRef ORETag,
875     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
876   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
877   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
878   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
879                 ORETag, TheLoop, I) << OREMsg);
880 }
881 
882 } // end namespace llvm
883 
884 #ifndef NDEBUG
885 /// \return string containing a file name and a line # for the given loop.
886 static std::string getDebugLocString(const Loop *L) {
887   std::string Result;
888   if (L) {
889     raw_string_ostream OS(Result);
890     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
891       LoopDbgLoc.print(OS);
892     else
893       // Just print the module name.
894       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
895     OS.flush();
896   }
897   return Result;
898 }
899 #endif
900 
901 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
902                                          const Instruction *Orig) {
903   // If the loop was versioned with memchecks, add the corresponding no-alias
904   // metadata.
905   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
906     LVer->annotateInstWithNoAlias(To, Orig);
907 }
908 
909 void InnerLoopVectorizer::addMetadata(Instruction *To,
910                                       Instruction *From) {
911   propagateMetadata(To, From);
912   addNewMetadata(To, From);
913 }
914 
915 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
916                                       Instruction *From) {
917   for (Value *V : To) {
918     if (Instruction *I = dyn_cast<Instruction>(V))
919       addMetadata(I, From);
920   }
921 }
922 
923 namespace llvm {
924 
925 // Loop vectorization cost-model hints how the scalar epilogue loop should be
926 // lowered.
927 enum ScalarEpilogueLowering {
928 
929   // The default: allowing scalar epilogues.
930   CM_ScalarEpilogueAllowed,
931 
932   // Vectorization with OptForSize: don't allow epilogues.
933   CM_ScalarEpilogueNotAllowedOptSize,
934 
935   // A special case of vectorisation with OptForSize: loops with a very small
936   // trip count are considered for vectorization under OptForSize, thereby
937   // making sure the cost of their loop body is dominant, free of runtime
938   // guards and scalar iteration overheads.
939   CM_ScalarEpilogueNotAllowedLowTripLoop,
940 
941   // Loop hint predicate indicating an epilogue is undesired.
942   CM_ScalarEpilogueNotNeededUsePredicate
943 };
944 
945 /// LoopVectorizationCostModel - estimates the expected speedups due to
946 /// vectorization.
947 /// In many cases vectorization is not profitable. This can happen because of
948 /// a number of reasons. In this class we mainly attempt to predict the
949 /// expected speedup/slowdowns due to the supported instruction set. We use the
950 /// TargetTransformInfo to query the different backends for the cost of
951 /// different operations.
952 class LoopVectorizationCostModel {
953 public:
954   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
955                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
956                              LoopVectorizationLegality *Legal,
957                              const TargetTransformInfo &TTI,
958                              const TargetLibraryInfo *TLI, DemandedBits *DB,
959                              AssumptionCache *AC,
960                              OptimizationRemarkEmitter *ORE, const Function *F,
961                              const LoopVectorizeHints *Hints,
962                              InterleavedAccessInfo &IAI)
963       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
964         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
965         Hints(Hints), InterleaveInfo(IAI) {}
966 
967   /// \return An upper bound for the vectorization factor, or None if
968   /// vectorization and interleaving should be avoided up front.
969   Optional<unsigned> computeMaxVF();
970 
971   /// \return True if runtime checks are required for vectorization, and false
972   /// otherwise.
973   bool runtimeChecksRequired();
974 
975   /// \return The most profitable vectorization factor and the cost of that VF.
976   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
977   /// then this vectorization factor will be selected if vectorization is
978   /// possible.
979   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
980 
981   /// Setup cost-based decisions for user vectorization factor.
982   void selectUserVectorizationFactor(unsigned UserVF) {
983     collectUniformsAndScalars(UserVF);
984     collectInstsToScalarize(UserVF);
985   }
986 
987   /// \return The size (in bits) of the smallest and widest types in the code
988   /// that needs to be vectorized. We ignore values that remain scalar such as
989   /// 64 bit loop indices.
990   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
991 
992   /// \return The desired interleave count.
993   /// If interleave count has been specified by metadata it will be returned.
994   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
995   /// are the selected vectorization factor and the cost of the selected VF.
996   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
997 
998   /// Memory access instruction may be vectorized in more than one way.
999   /// Form of instruction after vectorization depends on cost.
1000   /// This function takes cost-based decisions for Load/Store instructions
1001   /// and collects them in a map. This decisions map is used for building
1002   /// the lists of loop-uniform and loop-scalar instructions.
1003   /// The calculated cost is saved with widening decision in order to
1004   /// avoid redundant calculations.
1005   void setCostBasedWideningDecision(unsigned VF);
1006 
1007   /// A struct that represents some properties of the register usage
1008   /// of a loop.
1009   struct RegisterUsage {
1010     /// Holds the number of loop invariant values that are used in the loop.
1011     /// The key is ClassID of target-provided register class.
1012     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1013     /// Holds the maximum number of concurrent live intervals in the loop.
1014     /// The key is ClassID of target-provided register class.
1015     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1016   };
1017 
1018   /// \return Returns information about the register usages of the loop for the
1019   /// given vectorization factors.
1020   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1021 
1022   /// Collect values we want to ignore in the cost model.
1023   void collectValuesToIgnore();
1024 
1025   /// \returns The smallest bitwidth each instruction can be represented with.
1026   /// The vector equivalents of these instructions should be truncated to this
1027   /// type.
1028   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1029     return MinBWs;
1030   }
1031 
1032   /// \returns True if it is more profitable to scalarize instruction \p I for
1033   /// vectorization factor \p VF.
1034   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1035     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1036 
1037     // Cost model is not run in the VPlan-native path - return conservative
1038     // result until this changes.
1039     if (EnableVPlanNativePath)
1040       return false;
1041 
1042     auto Scalars = InstsToScalarize.find(VF);
1043     assert(Scalars != InstsToScalarize.end() &&
1044            "VF not yet analyzed for scalarization profitability");
1045     return Scalars->second.find(I) != Scalars->second.end();
1046   }
1047 
1048   /// Returns true if \p I is known to be uniform after vectorization.
1049   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1050     if (VF == 1)
1051       return true;
1052 
1053     // Cost model is not run in the VPlan-native path - return conservative
1054     // result until this changes.
1055     if (EnableVPlanNativePath)
1056       return false;
1057 
1058     auto UniformsPerVF = Uniforms.find(VF);
1059     assert(UniformsPerVF != Uniforms.end() &&
1060            "VF not yet analyzed for uniformity");
1061     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1062   }
1063 
1064   /// Returns true if \p I is known to be scalar after vectorization.
1065   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1066     if (VF == 1)
1067       return true;
1068 
1069     // Cost model is not run in the VPlan-native path - return conservative
1070     // result until this changes.
1071     if (EnableVPlanNativePath)
1072       return false;
1073 
1074     auto ScalarsPerVF = Scalars.find(VF);
1075     assert(ScalarsPerVF != Scalars.end() &&
1076            "Scalar values are not calculated for VF");
1077     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1078   }
1079 
1080   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1081   /// for vectorization factor \p VF.
1082   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1083     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1084            !isProfitableToScalarize(I, VF) &&
1085            !isScalarAfterVectorization(I, VF);
1086   }
1087 
1088   /// Decision that was taken during cost calculation for memory instruction.
1089   enum InstWidening {
1090     CM_Unknown,
1091     CM_Widen,         // For consecutive accesses with stride +1.
1092     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1093     CM_Interleave,
1094     CM_GatherScatter,
1095     CM_Scalarize
1096   };
1097 
1098   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1099   /// instruction \p I and vector width \p VF.
1100   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1101                            unsigned Cost) {
1102     assert(VF >= 2 && "Expected VF >=2");
1103     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1104   }
1105 
1106   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1107   /// interleaving group \p Grp and vector width \p VF.
1108   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1109                            InstWidening W, unsigned Cost) {
1110     assert(VF >= 2 && "Expected VF >=2");
1111     /// Broadcast this decicion to all instructions inside the group.
1112     /// But the cost will be assigned to one instruction only.
1113     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1114       if (auto *I = Grp->getMember(i)) {
1115         if (Grp->getInsertPos() == I)
1116           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1117         else
1118           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1119       }
1120     }
1121   }
1122 
1123   /// Return the cost model decision for the given instruction \p I and vector
1124   /// width \p VF. Return CM_Unknown if this instruction did not pass
1125   /// through the cost modeling.
1126   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1127     assert(VF >= 2 && "Expected VF >=2");
1128 
1129     // Cost model is not run in the VPlan-native path - return conservative
1130     // result until this changes.
1131     if (EnableVPlanNativePath)
1132       return CM_GatherScatter;
1133 
1134     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1135     auto Itr = WideningDecisions.find(InstOnVF);
1136     if (Itr == WideningDecisions.end())
1137       return CM_Unknown;
1138     return Itr->second.first;
1139   }
1140 
1141   /// Return the vectorization cost for the given instruction \p I and vector
1142   /// width \p VF.
1143   unsigned getWideningCost(Instruction *I, unsigned VF) {
1144     assert(VF >= 2 && "Expected VF >=2");
1145     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1146     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1147            "The cost is not calculated");
1148     return WideningDecisions[InstOnVF].second;
1149   }
1150 
1151   /// Return True if instruction \p I is an optimizable truncate whose operand
1152   /// is an induction variable. Such a truncate will be removed by adding a new
1153   /// induction variable with the destination type.
1154   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1155     // If the instruction is not a truncate, return false.
1156     auto *Trunc = dyn_cast<TruncInst>(I);
1157     if (!Trunc)
1158       return false;
1159 
1160     // Get the source and destination types of the truncate.
1161     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1162     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1163 
1164     // If the truncate is free for the given types, return false. Replacing a
1165     // free truncate with an induction variable would add an induction variable
1166     // update instruction to each iteration of the loop. We exclude from this
1167     // check the primary induction variable since it will need an update
1168     // instruction regardless.
1169     Value *Op = Trunc->getOperand(0);
1170     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1171       return false;
1172 
1173     // If the truncated value is not an induction variable, return false.
1174     return Legal->isInductionPhi(Op);
1175   }
1176 
1177   /// Collects the instructions to scalarize for each predicated instruction in
1178   /// the loop.
1179   void collectInstsToScalarize(unsigned VF);
1180 
1181   /// Collect Uniform and Scalar values for the given \p VF.
1182   /// The sets depend on CM decision for Load/Store instructions
1183   /// that may be vectorized as interleave, gather-scatter or scalarized.
1184   void collectUniformsAndScalars(unsigned VF) {
1185     // Do the analysis once.
1186     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1187       return;
1188     setCostBasedWideningDecision(VF);
1189     collectLoopUniforms(VF);
1190     collectLoopScalars(VF);
1191   }
1192 
1193   /// Returns true if the target machine supports masked store operation
1194   /// for the given \p DataType and kind of access to \p Ptr.
1195   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1196     return Legal->isConsecutivePtr(Ptr) &&
1197            TTI.isLegalMaskedStore(DataType, Alignment);
1198   }
1199 
1200   /// Returns true if the target machine supports masked load operation
1201   /// for the given \p DataType and kind of access to \p Ptr.
1202   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1203     return Legal->isConsecutivePtr(Ptr) &&
1204            TTI.isLegalMaskedLoad(DataType, Alignment);
1205   }
1206 
1207   /// Returns true if the target machine supports masked scatter operation
1208   /// for the given \p DataType.
1209   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1210     return TTI.isLegalMaskedScatter(DataType, Alignment);
1211   }
1212 
1213   /// Returns true if the target machine supports masked gather operation
1214   /// for the given \p DataType.
1215   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1216     return TTI.isLegalMaskedGather(DataType, Alignment);
1217   }
1218 
1219   /// Returns true if the target machine can represent \p V as a masked gather
1220   /// or scatter operation.
1221   bool isLegalGatherOrScatter(Value *V) {
1222     bool LI = isa<LoadInst>(V);
1223     bool SI = isa<StoreInst>(V);
1224     if (!LI && !SI)
1225       return false;
1226     auto *Ty = getMemInstValueType(V);
1227     MaybeAlign Align = getLoadStoreAlignment(V);
1228     return (LI && isLegalMaskedGather(Ty, Align)) ||
1229            (SI && isLegalMaskedScatter(Ty, Align));
1230   }
1231 
1232   /// Returns true if \p I is an instruction that will be scalarized with
1233   /// predication. Such instructions include conditional stores and
1234   /// instructions that may divide by zero.
1235   /// If a non-zero VF has been calculated, we check if I will be scalarized
1236   /// with predication for that VF.
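  /// For example (illustrative): in
  ///   if (c[i] != 0)
  ///     x = a[i] / c[i];
  /// the udiv may trap on masked-off lanes if widened, so it is scalarized and
  /// each scalar copy is executed only under its predicate.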
1237   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1238 
1239   // Returns true if \p I is an instruction that will be predicated either
1240   // through scalar predication or masked load/store or masked gather/scatter.
1241   // Superset of instructions that return true for isScalarWithPredication.
1242   bool isPredicatedInst(Instruction *I) {
1243     if (!blockNeedsPredication(I->getParent()))
1244       return false;
1245     // Loads and stores that need some form of masked operation are predicated
1246     // instructions.
1247     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1248       return Legal->isMaskRequired(I);
1249     return isScalarWithPredication(I);
1250   }
1251 
1252   /// Returns true if \p I is a memory instruction with consecutive memory
1253   /// access that can be widened.
1254   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1255 
1256   /// Returns true if \p I is a memory instruction in an interleaved-group
1257   /// of memory accesses that can be vectorized with wide vector loads/stores
1258   /// and shuffles.
1259   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1260 
1261   /// Check if \p Instr belongs to any interleaved access group.
1262   bool isAccessInterleaved(Instruction *Instr) {
1263     return InterleaveInfo.isInterleaved(Instr);
1264   }
1265 
1266   /// Get the interleaved access group that \p Instr belongs to.
1267   const InterleaveGroup<Instruction> *
1268   getInterleavedAccessGroup(Instruction *Instr) {
1269     return InterleaveInfo.getInterleaveGroup(Instr);
1270   }
1271 
1272   /// Returns true if an interleaved group requires a scalar iteration
1273   /// to handle accesses with gaps, and there is nothing preventing us from
1274   /// creating a scalar epilogue.
1275   bool requiresScalarEpilogue() const {
1276     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1277   }
1278 
1279   /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1280   /// disallowed due to optsize or a loop hint annotation.
1281   bool isScalarEpilogueAllowed() const {
1282     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1283   }
1284 
1285   /// Returns true if all loop blocks should be masked to fold tail loop.
1286   bool foldTailByMasking() const { return FoldTailByMasking; }
1287 
1288   bool blockNeedsPredication(BasicBlock *BB) {
1289     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1290   }
1291 
1292   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1293   /// with factor VF.  Return the cost of the instruction, including
1294   /// scalarization overhead if it's needed.
1295   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1296 
1297   /// Estimate cost of a call instruction CI if it were vectorized with factor
1298   /// VF. Return the cost of the instruction, including scalarization overhead
1299   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1300   /// scalarized, i.e. either a vector version isn't available, or it is too
1301   /// expensive.
1302   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1303 
1304 private:
1305   unsigned NumPredStores = 0;
1306 
1307   /// \return An upper bound for the vectorization factor, larger than zero.
1308   /// One is returned if vectorization should best be avoided due to cost.
1309   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1310 
1311   /// The vectorization cost is a combination of the cost itself and a boolean
1312   /// indicating whether any of the contributing operations will actually
1313   /// operate on vector values after type legalization in the backend. If this
1314   /// latter value is false, then all operations will be scalarized (i.e. no
1315   /// vectorization has actually taken place).
1318   using VectorizationCostTy = std::pair<unsigned, bool>;
1319 
1320   /// Returns the expected execution cost. The unit of the cost does
1321   /// not matter because we use the 'cost' units to compare different
1322   /// vector widths. The cost that is returned is *not* normalized by
1323   /// the vectorization factor.
1324   VectorizationCostTy expectedCost(unsigned VF);
1325 
1326   /// Returns the execution time cost of an instruction for a given vector
1327   /// width. Vector width of one means scalar.
1328   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1329 
1330   /// The cost-computation logic from getInstructionCost which provides
1331   /// the vector type as an output parameter.
1332   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1333 
1334   /// Calculate vectorization cost of memory instruction \p I.
1335   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1336 
1337   /// The cost computation for scalarized memory instruction.
1338   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1339 
1340   /// The cost computation for interleaving group of memory instructions.
1341   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1342 
1343   /// The cost computation for Gather/Scatter instruction.
1344   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1345 
1346   /// The cost computation for widening instruction \p I with consecutive
1347   /// memory access.
1348   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1349 
1350   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1351   /// Load: scalar load + broadcast.
1352   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1353   /// element)
1354   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1355 
1356   /// Estimate the overhead of scalarizing an instruction. This is a
1357   /// convenience wrapper for the type-based getScalarizationOverhead API.
1358   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1359 
1360   /// Returns whether the instruction is a load or store and will be emitted
1361   /// as a vector operation.
1362   bool isConsecutiveLoadOrStore(Instruction *I);
1363 
1364   /// Returns true if an artificially high cost for emulated masked memrefs
1365   /// should be used.
1366   bool useEmulatedMaskMemRefHack(Instruction *I);
1367 
1368   /// Map of scalar integer values to the smallest bitwidth they can be legally
1369   /// represented as. The vector equivalents of these values should be truncated
1370   /// to this type.
1371   MapVector<Instruction *, uint64_t> MinBWs;
1372 
1373   /// A type representing the costs for instructions if they were to be
1374   /// scalarized rather than vectorized. The entries are Instruction-Cost
1375   /// pairs.
1376   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1377 
1378   /// A set containing all BasicBlocks that are known to be present after
1379   /// vectorization as predicated blocks.
1380   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1381 
1382   /// Records whether it is allowed to have the original scalar loop execute at
1383   /// least once. This may be needed as a fallback loop in case runtime
1384   /// aliasing/dependence checks fail, or to handle the tail/remainder
1385   /// iterations when the trip count is unknown or is not divisible by the VF,
1386   /// or as a peel-loop to handle gaps in interleave-groups.
1387   /// Under optsize and when the trip count is very small we don't allow any
1388   /// iterations to execute in the scalar loop.
1389   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1390 
1391   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1392   bool FoldTailByMasking = false;
1393 
1394   /// A map holding scalar costs for different vectorization factors. The
1395   /// presence of a cost for an instruction in the mapping indicates that the
1396   /// instruction will be scalarized when vectorizing with the associated
1397   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1398   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1399 
1400   /// Holds the instructions known to be uniform after vectorization.
1401   /// The data is collected per VF.
1402   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1403 
1404   /// Holds the instructions known to be scalar after vectorization.
1405   /// The data is collected per VF.
1406   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1407 
1408   /// Holds the instructions (address computations) that are forced to be
1409   /// scalarized.
1410   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1411 
1412   /// Returns the expected difference in cost from scalarizing the expression
1413   /// feeding a predicated instruction \p PredInst. The instructions to
1414   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1415   /// non-negative return value implies the expression will be scalarized.
1416   /// Currently, only single-use chains are considered for scalarization.
1417   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1418                               unsigned VF);
1419 
1420   /// Collect the instructions that are uniform after vectorization. An
1421   /// instruction is uniform if we represent it with a single scalar value in
1422   /// the vectorized loop corresponding to each vector iteration. Examples of
1423   /// uniform instructions include pointer operands of consecutive or
1424   /// interleaved memory accesses. Note that although uniformity implies an
1425   /// instruction will be scalar, the reverse is not true. In general, a
1426   /// scalarized instruction will be represented by VF scalar values in the
1427   /// vectorized loop, each corresponding to an iteration of the original
1428   /// scalar loop.
1429   void collectLoopUniforms(unsigned VF);
1430 
1431   /// Collect the instructions that are scalar after vectorization. An
1432   /// instruction is scalar if it is known to be uniform or will be scalarized
1433   /// during vectorization. Non-uniform scalarized instructions will be
1434   /// represented by VF values in the vectorized loop, each corresponding to an
1435   /// iteration of the original scalar loop.
1436   void collectLoopScalars(unsigned VF);
1437 
1438   /// Keeps cost model vectorization decision and cost for instructions.
1439   /// Right now it is used for memory instructions only.
1440   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1441                                 std::pair<InstWidening, unsigned>>;
1442 
1443   DecisionList WideningDecisions;
1444 
1445   /// Returns true if \p V is expected to be vectorized and it needs to be
1446   /// extracted.
1447   bool needsExtract(Value *V, unsigned VF) const {
1448     Instruction *I = dyn_cast<Instruction>(V);
1449     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1450       return false;
1451 
1452     // Assume we can vectorize V (and hence we need extraction) if the
1453     // scalars are not computed yet. This can happen, because it is called
1454     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1455     // the scalars are collected. That should be a safe assumption in most
1456     // cases, because we check if the operands have vectorizable types
1457     // beforehand in LoopVectorizationLegality.
1458     return Scalars.find(VF) == Scalars.end() ||
1459            !isScalarAfterVectorization(I, VF);
1460   }
1461 
1462   /// Returns a range containing only operands needing to be extracted.
1463   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1464                                                    unsigned VF) {
1465     return SmallVector<Value *, 4>(make_filter_range(
1466         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1467   }
1468 
1469 public:
1470   /// The loop that we evaluate.
1471   Loop *TheLoop;
1472 
1473   /// Predicated scalar evolution analysis.
1474   PredicatedScalarEvolution &PSE;
1475 
1476   /// Loop Info analysis.
1477   LoopInfo *LI;
1478 
1479   /// Vectorization legality.
1480   LoopVectorizationLegality *Legal;
1481 
1482   /// Vector target information.
1483   const TargetTransformInfo &TTI;
1484 
1485   /// Target Library Info.
1486   const TargetLibraryInfo *TLI;
1487 
1488   /// Demanded bits analysis.
1489   DemandedBits *DB;
1490 
1491   /// Assumption cache.
1492   AssumptionCache *AC;
1493 
1494   /// Interface to emit optimization remarks.
1495   OptimizationRemarkEmitter *ORE;
1496 
1497   const Function *TheFunction;
1498 
1499   /// Loop Vectorize Hint.
1500   const LoopVectorizeHints *Hints;
1501 
1502   /// The interleave access information contains groups of interleaved accesses
1503   /// with the same stride that are close to each other.
1504   InterleavedAccessInfo &InterleaveInfo;
1505 
1506   /// Values to ignore in the cost model.
1507   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1508 
1509   /// Values to ignore in the cost model when VF > 1.
1510   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1511 };
1512 
1513 } // end namespace llvm
1514 
1515 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1516 // vectorization. The loop needs to be annotated with #pragma omp simd
1517 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1518 // vector length information is not provided, vectorization is not considered
1519 // explicit. Interleave hints are not allowed either. These limitations will be
1520 // relaxed in the future.
1521 // Please note that we are currently forced to abuse the pragma 'clang
1522 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1523 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1524 // provides *explicit vectorization hints* (LV can bypass legal checks and
1525 // assume that vectorization is legal). However, both hints are implemented
1526 // using the same metadata (llvm.loop.vectorize, processed by
1527 // LoopVectorizeHints). This will be fixed in the future when the native IR
1528 // representation for pragma 'omp simd' is introduced.
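// For example (illustrative), an outer loop annotated like this is considered
// for explicit vectorization by this path:
//   #pragma omp simd simdlen(8)
//   for (int i = 0; i < N; ++i)    // outer loop
//     for (int j = 0; j < M; ++j)  // inner loop
//       A[i][j] += B[i][j];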
1529 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1530                                    OptimizationRemarkEmitter *ORE) {
1531   assert(!OuterLp->empty() && "This is not an outer loop");
1532   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1533 
1534   // Only outer loops with an explicit vectorization hint are supported.
1535   // Unannotated outer loops are ignored.
1536   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1537     return false;
1538 
1539   Function *Fn = OuterLp->getHeader()->getParent();
1540   if (!Hints.allowVectorization(Fn, OuterLp,
1541                                 true /*VectorizeOnlyWhenForced*/)) {
1542     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1543     return false;
1544   }
1545 
1546   if (Hints.getInterleave() > 1) {
1547     // TODO: Interleave support is future work.
1548     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1549                          "outer loops.\n");
1550     Hints.emitRemarkWithHints();
1551     return false;
1552   }
1553 
1554   return true;
1555 }
1556 
1557 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1558                                   OptimizationRemarkEmitter *ORE,
1559                                   SmallVectorImpl<Loop *> &V) {
1560   // Collect inner loops and outer loops without irreducible control flow. For
1561   // now, only collect outer loops that have explicit vectorization hints. If we
1562   // are stress testing the VPlan H-CFG construction, we collect the outermost
1563   // loop of every loop nest.
1564   if (L.empty() || VPlanBuildStressTest ||
1565       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1566     LoopBlocksRPO RPOT(&L);
1567     RPOT.perform(LI);
1568     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1569       V.push_back(&L);
1570       // TODO: Collect inner loops inside marked outer loops in case
1571       // vectorization fails for the outer loop. Do not invoke
1572       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1573       // already known to be reducible. We can use an inherited attribute for
1574       // that.
1575       return;
1576     }
1577   }
1578   for (Loop *InnerL : L)
1579     collectSupportedLoops(*InnerL, LI, ORE, V);
1580 }
1581 
1582 namespace {
1583 
1584 /// The LoopVectorize Pass.
1585 struct LoopVectorize : public FunctionPass {
1586   /// Pass identification, replacement for typeid
1587   static char ID;
1588 
1589   LoopVectorizePass Impl;
1590 
1591   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1592                          bool VectorizeOnlyWhenForced = false)
1593       : FunctionPass(ID) {
1594     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1595     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1596     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1597   }
1598 
1599   bool runOnFunction(Function &F) override {
1600     if (skipFunction(F))
1601       return false;
1602 
1603     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1604     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1605     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1606     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1607     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1608     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1609     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1610     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1611     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1612     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1613     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1614     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1615     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1616 
1617     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1618         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1619 
1620     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1621                         GetLAA, *ORE, PSI);
1622   }
1623 
1624   void getAnalysisUsage(AnalysisUsage &AU) const override {
1625     AU.addRequired<AssumptionCacheTracker>();
1626     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1627     AU.addRequired<DominatorTreeWrapperPass>();
1628     AU.addRequired<LoopInfoWrapperPass>();
1629     AU.addRequired<ScalarEvolutionWrapperPass>();
1630     AU.addRequired<TargetTransformInfoWrapperPass>();
1631     AU.addRequired<AAResultsWrapperPass>();
1632     AU.addRequired<LoopAccessLegacyAnalysis>();
1633     AU.addRequired<DemandedBitsWrapperPass>();
1634     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1635     AU.addRequired<InjectTLIMappingsLegacy>();
1636 
1637     // We currently do not preserve loopinfo/dominator analyses with outer loop
1638     // vectorization. Until this is addressed, mark these analyses as preserved
1639     // only for non-VPlan-native path.
1640     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1641     if (!EnableVPlanNativePath) {
1642       AU.addPreserved<LoopInfoWrapperPass>();
1643       AU.addPreserved<DominatorTreeWrapperPass>();
1644     }
1645 
1646     AU.addPreserved<BasicAAWrapperPass>();
1647     AU.addPreserved<GlobalsAAWrapperPass>();
1648     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1649   }
1650 };
1651 
1652 } // end anonymous namespace
1653 
1654 //===----------------------------------------------------------------------===//
1655 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1656 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1657 //===----------------------------------------------------------------------===//
1658 
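// For example (illustrative), broadcasting a loop-invariant i32 %x with VF = 4
// produces the usual insertelement/shufflevector splat, roughly:
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer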
1659 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1660   // We need to place the broadcast of invariant variables outside the loop,
1661   // but only if it's proven safe to do so. Otherwise, the broadcast will be
1662   // placed inside the vector loop body.
1663   Instruction *Instr = dyn_cast<Instruction>(V);
1664   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1665                      (!Instr ||
1666                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1667   // Place the code for broadcasting invariant variables in the new preheader.
1668   IRBuilder<>::InsertPointGuard Guard(Builder);
1669   if (SafeToHoist)
1670     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1671 
1672   // Broadcast the scalar into all locations in the vector.
1673   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1674 
1675   return Shuf;
1676 }
1677 
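// For example (illustrative), for an i32 induction starting at 0 with step 1,
// VF = 4 and UF = 1, this creates roughly:
//   vector.body:
//     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>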
1678 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1679     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1680   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1681          "Expected either an induction phi-node or a truncate of it!");
1682   Value *Start = II.getStartValue();
1683 
1684   // Construct the initial value of the vector IV in the vector loop preheader
1685   auto CurrIP = Builder.saveIP();
1686   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1687   if (isa<TruncInst>(EntryVal)) {
1688     assert(Start->getType()->isIntegerTy() &&
1689            "Truncation requires an integer type");
1690     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1691     Step = Builder.CreateTrunc(Step, TruncType);
1692     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1693   }
1694   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1695   Value *SteppedStart =
1696       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1697 
1698   // We create vector phi nodes for both integer and floating-point induction
1699   // variables. Here, we determine the kind of arithmetic we will perform.
1700   Instruction::BinaryOps AddOp;
1701   Instruction::BinaryOps MulOp;
1702   if (Step->getType()->isIntegerTy()) {
1703     AddOp = Instruction::Add;
1704     MulOp = Instruction::Mul;
1705   } else {
1706     AddOp = II.getInductionOpcode();
1707     MulOp = Instruction::FMul;
1708   }
1709 
1710   // Multiply the vectorization factor by the step using integer or
1711   // floating-point arithmetic as appropriate.
1712   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1713   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1714 
1715   // Create a vector splat to use in the induction update.
1716   //
1717   // FIXME: If the step is non-constant, we create the vector splat with
1718   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1719   //        handle a constant vector splat.
1720   Value *SplatVF = isa<Constant>(Mul)
1721                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1722                        : Builder.CreateVectorSplat(VF, Mul);
1723   Builder.restoreIP(CurrIP);
1724 
1725   // We may need to add the step a number of times, depending on the unroll
1726   // factor. The last of those goes into the PHI.
1727   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1728                                     &*LoopVectorBody->getFirstInsertionPt());
1729   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1730   Instruction *LastInduction = VecInd;
1731   for (unsigned Part = 0; Part < UF; ++Part) {
1732     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1733 
1734     if (isa<TruncInst>(EntryVal))
1735       addMetadata(LastInduction, EntryVal);
1736     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1737 
1738     LastInduction = cast<Instruction>(addFastMathFlag(
1739         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1740     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1741   }
1742 
1743   // Move the last step to the end of the latch block. This ensures consistent
1744   // placement of all induction updates.
1745   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1746   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1747   auto *ICmp = cast<Instruction>(Br->getCondition());
1748   LastInduction->moveBefore(ICmp);
1749   LastInduction->setName("vec.ind.next");
1750 
1751   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1752   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1753 }
1754 
1755 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1756   return Cost->isScalarAfterVectorization(I, VF) ||
1757          Cost->isProfitableToScalarize(I, VF);
1758 }
1759 
1760 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1761   if (shouldScalarizeInstruction(IV))
1762     return true;
1763   auto isScalarInst = [&](User *U) -> bool {
1764     auto *I = cast<Instruction>(U);
1765     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1766   };
1767   return llvm::any_of(IV->users(), isScalarInst);
1768 }
1769 
1770 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1771     const InductionDescriptor &ID, const Instruction *EntryVal,
1772     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1773   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1774          "Expected either an induction phi-node or a truncate of it!");
1775 
1776   // This induction variable is not the phi from the original loop but the
1777   // newly-created IV based on the proof that the casted Phi is equal to the
1778   // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
1779   // re-uses the same InductionDescriptor that the original IV uses, but we
1780   // don't have to do any recording in this case - that is done when the
1781   // original IV is processed.
1782   if (isa<TruncInst>(EntryVal))
1783     return;
1784 
1785   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1786   if (Casts.empty())
1787     return;
1788   // Only the first Cast instruction in the Casts vector is of interest.
1789   // The rest of the Casts (if they exist) have no uses outside the
1790   // induction update chain itself.
1791   Instruction *CastInst = *Casts.begin();
1792   if (Lane < UINT_MAX)
1793     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1794   else
1795     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1796 }
1797 
1798 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1799   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1800          "Primary induction variable must have an integer type");
1801 
1802   auto II = Legal->getInductionVars()->find(IV);
1803   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1804 
1805   auto ID = II->second;
1806   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1807 
1808   // The scalar value to broadcast. This will be derived from the canonical
1809   // induction variable.
1810   Value *ScalarIV = nullptr;
1811 
1812   // The value from the original loop to which we are mapping the new induction
1813   // variable.
1814   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1815 
1816   // True if we have vectorized the induction variable.
1817   auto VectorizedIV = false;
1818 
1819   // Determine if we want a scalar version of the induction variable. This is
1820   // true if the induction variable itself is not widened, or if it has at
1821   // least one user in the loop that is not widened.
1822   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1823 
1824   // Generate code for the induction step. Note that induction steps are
1825   // required to be loop-invariant.
1826   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1827          "Induction step should be loop invariant");
1828   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1829   Value *Step = nullptr;
1830   if (PSE.getSE()->isSCEVable(IV->getType())) {
1831     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1832     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1833                              LoopVectorPreHeader->getTerminator());
1834   } else {
1835     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1836   }
1837 
1838   // Try to create a new independent vector induction variable. If we can't
1839   // create the phi node, we will splat the scalar induction variable in each
1840   // loop iteration.
1841   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1842     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1843     VectorizedIV = true;
1844   }
1845 
1846   // If we haven't yet vectorized the induction variable, or if we will create
1847   // a scalar one, we need to define the scalar induction variable and step
1848   // values. If we were given a truncation type, truncate the canonical
1849   // induction variable and step. Otherwise, derive these values from the
1850   // induction descriptor.
1851   if (!VectorizedIV || NeedsScalarIV) {
1852     ScalarIV = Induction;
1853     if (IV != OldInduction) {
1854       ScalarIV = IV->getType()->isIntegerTy()
1855                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1856                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1857                                           IV->getType());
1858       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1859       ScalarIV->setName("offset.idx");
1860     }
1861     if (Trunc) {
1862       auto *TruncType = cast<IntegerType>(Trunc->getType());
1863       assert(Step->getType()->isIntegerTy() &&
1864              "Truncation requires an integer step");
1865       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1866       Step = Builder.CreateTrunc(Step, TruncType);
1867     }
1868   }
1869 
1870   // If we haven't yet vectorized the induction variable, splat the scalar
1871   // induction variable, and build the necessary step vectors.
1872   // TODO: Don't do it unless the vectorized IV is really required.
1873   if (!VectorizedIV) {
1874     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1875     for (unsigned Part = 0; Part < UF; ++Part) {
1876       Value *EntryPart =
1877           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1878       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1879       if (Trunc)
1880         addMetadata(EntryPart, Trunc);
1881       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1882     }
1883   }
1884 
1885   // If an induction variable is only used for counting loop iterations or
1886   // calculating addresses, it doesn't need to be widened. Create scalar steps
1887   // that can be used by instructions we will later scalarize. Note that the
1888   // addition of the scalar steps will not increase the number of instructions
1889   // in the loop in the common case prior to InstCombine. We will be trading
1890   // one vector extract for each scalar step.
1891   if (NeedsScalarIV)
1892     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1893 }
1894 
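// For example (illustrative, names invented): with StartIdx = 0, an integer
// Step of 2 and Val being a 4-wide splat of %x, the result is roughly
//   %induction = add <4 x i32> %val.splat, <i32 0, i32 2, i32 4, i32 6>
// i.e. lane i holds %x + (StartIdx + i) * Step.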
1895 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1896                                           Instruction::BinaryOps BinOp) {
1897   // Create and check the types.
1898   assert(Val->getType()->isVectorTy() && "Must be a vector");
1899   int VLen = Val->getType()->getVectorNumElements();
1900 
1901   Type *STy = Val->getType()->getScalarType();
1902   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1903          "Induction Step must be an integer or FP");
1904   assert(Step->getType() == STy && "Step has wrong type");
1905 
1906   SmallVector<Constant *, 8> Indices;
1907 
1908   if (STy->isIntegerTy()) {
1909     // Create a vector of consecutive numbers starting at StartIdx.
1910     for (int i = 0; i < VLen; ++i)
1911       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1912 
1913     // Add the consecutive indices to the vector value.
1914     Constant *Cv = ConstantVector::get(Indices);
1915     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1916     Step = Builder.CreateVectorSplat(VLen, Step);
1917     assert(Step->getType() == Val->getType() && "Invalid step vec");
1918     // FIXME: The newly created binary instructions should contain nsw/nuw
1919     // flags, which can be found from the original scalar operations.
1920     Step = Builder.CreateMul(Cv, Step);
1921     return Builder.CreateAdd(Val, Step, "induction");
1922   }
1923 
1924   // Floating point induction.
1925   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1926          "Binary Opcode should be specified for FP induction");
1927   // Create a vector of consecutive numbers starting at StartIdx.
1928   for (int i = 0; i < VLen; ++i)
1929     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1930 
1931   // Add the consecutive indices to the vector value.
1932   Constant *Cv = ConstantVector::get(Indices);
1933 
1934   Step = Builder.CreateVectorSplat(VLen, Step);
1935 
1936   // Floating point operations had to be 'fast' to enable the induction.
1937   FastMathFlags Flags;
1938   Flags.setFast();
1939 
1940   Value *MulOp = Builder.CreateFMul(Cv, Step);
1941   if (isa<Instruction>(MulOp))
1942     // We have to check because MulOp may be folded to a constant.
1943     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1944 
1945   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1946   if (isa<Instruction>(BOp))
1947     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1948   return BOp;
1949 }
1950 
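// For example (illustrative): with an integer scalar IV %iv, Step = 1, VF = 4
// and UF = 1, this records the per-lane values %iv + 0, %iv + 1, %iv + 2 and
// %iv + 3 (only lane 0 is generated if EntryVal is uniform after
// vectorization).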
1951 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1952                                            Instruction *EntryVal,
1953                                            const InductionDescriptor &ID) {
1954   // We shouldn't have to build scalar steps if we aren't vectorizing.
1955   assert(VF > 1 && "VF should be greater than one");
1956 
1957   // Get the value type and ensure it and the step have the same integer type.
1958   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1959   assert(ScalarIVTy == Step->getType() &&
1960          "Val and Step should have the same type");
1961 
1962   // We build scalar steps for both integer and floating-point induction
1963   // variables. Here, we determine the kind of arithmetic we will perform.
1964   Instruction::BinaryOps AddOp;
1965   Instruction::BinaryOps MulOp;
1966   if (ScalarIVTy->isIntegerTy()) {
1967     AddOp = Instruction::Add;
1968     MulOp = Instruction::Mul;
1969   } else {
1970     AddOp = ID.getInductionOpcode();
1971     MulOp = Instruction::FMul;
1972   }
1973 
1974   // Determine the number of scalars we need to generate for each unroll
1975   // iteration. If EntryVal is uniform, we only need to generate the first
1976   // lane. Otherwise, we generate all VF values.
1977   unsigned Lanes =
1978       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1979                                                                          : VF;
1980   // Compute the scalar steps and save the results in VectorLoopValueMap.
1981   for (unsigned Part = 0; Part < UF; ++Part) {
1982     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1983       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1984       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1985       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1986       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1987       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1988     }
1989   }
1990 }
1991 
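// For example (illustrative, names invented): if %v was scalarized into
// per-lane values %v0..%v3 for VF = 4 and its vector form is later requested,
// the lanes are packed back with a chain of insertelements starting from
// undef, roughly:
//   %pack0 = insertelement <4 x i32> undef, i32 %v0, i32 0
//   %pack1 = insertelement <4 x i32> %pack0, i32 %v1, i32 1
//   ...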
1992 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1993   assert(V != Induction && "The new induction variable should not be used.");
1994   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1995   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1996 
1997   // If we have a stride that is replaced by one, do it here. Defer this for
1998   // the VPlan-native path until we start running Legal checks in that path.
1999   if (!EnableVPlanNativePath && Legal->hasStride(V))
2000     V = ConstantInt::get(V->getType(), 1);
2001 
2002   // If we have a vector mapped to this value, return it.
2003   if (VectorLoopValueMap.hasVectorValue(V, Part))
2004     return VectorLoopValueMap.getVectorValue(V, Part);
2005 
2006   // If the value has not been vectorized, check if it has been scalarized
2007   // instead. If it has been scalarized, and we actually need the value in
2008   // vector form, we will construct the vector values on demand.
2009   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2010     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2011 
2012     // If we've scalarized a value, that value should be an instruction.
2013     auto *I = cast<Instruction>(V);
2014 
2015     // If we aren't vectorizing, we can just copy the scalar map values over to
2016     // the vector map.
2017     if (VF == 1) {
2018       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2019       return ScalarValue;
2020     }
2021 
2022     // Get the last scalar instruction we generated for V and Part. If the value
2023     // is known to be uniform after vectorization, this corresponds to lane zero
2024     // of the Part unroll iteration. Otherwise, the last instruction is the one
2025     // we created for the last vector lane of the Part unroll iteration.
2026     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2027     auto *LastInst = cast<Instruction>(
2028         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2029 
2030     // Set the insert point after the last scalarized instruction. This ensures
2031     // the insertelement sequence will directly follow the scalar definitions.
2032     auto OldIP = Builder.saveIP();
2033     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2034     Builder.SetInsertPoint(&*NewIP);
2035 
2036     // However, if we are vectorizing, we need to construct the vector values.
2037     // If the value is known to be uniform after vectorization, we can just
2038     // broadcast the scalar value corresponding to lane zero for each unroll
2039     // iteration. Otherwise, we construct the vector values using insertelement
2040     // instructions. Since the resulting vectors are stored in
2041     // VectorLoopValueMap, we will only generate the insertelements once.
2042     Value *VectorValue = nullptr;
2043     if (Cost->isUniformAfterVectorization(I, VF)) {
2044       VectorValue = getBroadcastInstrs(ScalarValue);
2045       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2046     } else {
2047       // Initialize packing with insertelements to start from undef.
2048       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2049       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2050       for (unsigned Lane = 0; Lane < VF; ++Lane)
2051         packScalarIntoVectorValue(V, {Part, Lane});
2052       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2053     }
2054     Builder.restoreIP(OldIP);
2055     return VectorValue;
2056   }
2057 
2058   // If this scalar is unknown, assume that it is a constant or that it is
2059   // loop invariant. Broadcast V and save the value for future uses.
2060   Value *B = getBroadcastInstrs(V);
2061   VectorLoopValueMap.setVectorValue(V, Part, B);
2062   return B;
2063 }
2064 
2065 Value *
2066 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2067                                             const VPIteration &Instance) {
2068   // If the value is not an instruction contained in the loop, it should
2069   // already be scalar.
2070   if (OrigLoop->isLoopInvariant(V))
2071     return V;
2072 
2073   assert((Instance.Lane == 0 ||
2074           !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
2075          "Uniform values only have lane zero");
2076 
2077   // If the value from the original loop has not been vectorized, it is
2078   // represented by UF x VF scalar values in the new loop. Return the requested
2079   // scalar value.
2080   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2081     return VectorLoopValueMap.getScalarValue(V, Instance);
2082 
2083   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2084   // for the given unroll part. If this entry is not a vector type (i.e., the
2085   // vectorization factor is one), there is no need to generate an
2086   // extractelement instruction.
2087   auto *U = getOrCreateVectorValue(V, Instance.Part);
2088   if (!U->getType()->isVectorTy()) {
2089     assert(VF == 1 && "Value not scalarized has non-vector type");
2090     return U;
2091   }
2092 
2093   // Otherwise, the value from the original loop has been vectorized and is
2094   // represented by UF vector values. Extract and return the requested scalar
2095   // value from the appropriate vector lane.
2096   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2097 }
2098 
2099 void InnerLoopVectorizer::packScalarIntoVectorValue(
2100     Value *V, const VPIteration &Instance) {
2101   assert(V != Induction && "The new induction variable should not be used.");
2102   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2103   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2104 
2105   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2106   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2107   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2108                                             Builder.getInt32(Instance.Lane));
2109   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2110 }
2111 
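// For example (illustrative), reversing a 4-wide vector emits a shuffle with a
// descending mask:
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>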
2112 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2113   assert(Vec->getType()->isVectorTy() && "Invalid type");
2114   SmallVector<Constant *, 8> ShuffleMask;
2115   for (unsigned i = 0; i < VF; ++i)
2116     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2117 
2118   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2119                                      ConstantVector::get(ShuffleMask),
2120                                      "reverse");
2121 }
2122 
2123 // Return whether we allow using masked interleave-groups (for dealing with
2124 // strided loads/stores that reside in predicated blocks, or for dealing
2125 // with gaps).
2126 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2127   // If an override option has been passed in for interleaved accesses, use it.
2128   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2129     return EnableMaskedInterleavedMemAccesses;
2130 
2131   return TTI.enableMaskedInterleavedAccessVectorization();
2132 }
2133 
2134 // Try to vectorize the interleave group that \p Instr belongs to.
2135 //
2136 // E.g. Translate following interleaved load group (factor = 3):
2137 //   for (i = 0; i < N; i+=3) {
2138 //     R = Pic[i];             // Member of index 0
2139 //     G = Pic[i+1];           // Member of index 1
2140 //     B = Pic[i+2];           // Member of index 2
2141 //     ... // do something to R, G, B
2142 //   }
2143 // To:
2144 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2145 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2146 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2147 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2148 //
2149 // Or translate following interleaved store group (factor = 3):
2150 //   for (i = 0; i < N; i+=3) {
2151 //     ... do something to R, G, B
2152 //     Pic[i]   = R;           // Member of index 0
2153 //     Pic[i+1] = G;           // Member of index 1
2154 //     Pic[i+2] = B;           // Member of index 2
2155 //   }
2156 // To:
2157 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2158 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2159 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2160 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2161 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2162 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2163                                                    VPTransformState &State,
2164                                                    VPValue *Addr,
2165                                                    VPValue *BlockInMask) {
2166   const InterleaveGroup<Instruction> *Group =
2167       Cost->getInterleavedAccessGroup(Instr);
2168   assert(Group && "Fail to get an interleaved access group.");
2169 
2170   // Skip if current instruction is not the insert position.
2171   if (Instr != Group->getInsertPos())
2172     return;
2173 
2174   const DataLayout &DL = Instr->getModule()->getDataLayout();
2175 
2176   // Prepare for the vector type of the interleaved load/store.
2177   Type *ScalarTy = getMemInstValueType(Instr);
2178   unsigned InterleaveFactor = Group->getFactor();
2179   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2180 
2181   // Prepare for the new pointers.
2182   SmallVector<Value *, 2> AddrParts;
2183   unsigned Index = Group->getIndex(Instr);
2184 
2185   // TODO: extend the masked interleaved-group support to reversed access.
2186   assert((!BlockInMask || !Group->isReverse()) &&
2187          "Reversed masked interleave-group not supported.");
2188 
2189   // If the group is reverse, adjust the index to refer to the last vector lane
2190   // instead of the first. We adjust the index from the first vector lane,
2191   // rather than directly getting the pointer for lane VF - 1, because the
2192   // pointer operand of the interleaved access is supposed to be uniform. For
2193   // uniform instructions, we're only required to generate a value for the
2194   // first vector lane in each unroll iteration.
2195   if (Group->isReverse())
2196     Index += (VF - 1) * Group->getFactor();
2197 
2198   for (unsigned Part = 0; Part < UF; Part++) {
2199     Value *AddrPart = State.get(Addr, {Part, 0});
2200     setDebugLocFromInst(Builder, AddrPart);
2201 
2202     // Note that the current instruction could be at any member index. We need
2203     // to adjust the address to the member of index 0.
2204     //
2205     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2206     //       b = A[i];       // Member of index 0
2207     // The current pointer points to A[i+1]; adjust it to A[i].
2208     //
2209     // E.g.  A[i+1] = a;     // Member of index 1
2210     //       A[i]   = b;     // Member of index 0
2211     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2212     // The current pointer points to A[i+2]; adjust it to A[i].
2213 
2214     bool InBounds = false;
2215     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2216       InBounds = gep->isInBounds();
2217     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2218     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2219 
2220     // Cast to the vector pointer type.
2221     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2222     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2223     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2224   }
2225 
2226   setDebugLocFromInst(Builder, Instr);
2227   Value *UndefVec = UndefValue::get(VecTy);
2228 
2229   Value *MaskForGaps = nullptr;
2230   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2231     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2232     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2233   }
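  // For example (illustrative): for a load group with factor 3 whose member at
  // index 1 is missing and VF = 4, the gap mask disables the second element of
  // each of the four tuples in the wide load, i.e. roughly
  //   <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>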
2234 
2235   // Vectorize the interleaved load group.
2236   if (isa<LoadInst>(Instr)) {
2237     // For each unroll part, create a wide load for the group.
2238     SmallVector<Value *, 2> NewLoads;
2239     for (unsigned Part = 0; Part < UF; Part++) {
2240       Instruction *NewLoad;
2241       if (BlockInMask || MaskForGaps) {
2242         assert(useMaskedInterleavedAccesses(*TTI) &&
2243                "masked interleaved groups are not allowed.");
2244         Value *GroupMask = MaskForGaps;
2245         if (BlockInMask) {
2246           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2247           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2248           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2249           Value *ShuffledMask = Builder.CreateShuffleVector(
2250               BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2251           GroupMask = MaskForGaps
2252                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2253                                                 MaskForGaps)
2254                           : ShuffledMask;
2255         }
2256         NewLoad =
2257             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2258                                      GroupMask, UndefVec, "wide.masked.vec");
2259       } else
2261         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2262                                             Group->getAlign(), "wide.vec");
2263       Group->addMetadata(NewLoad);
2264       NewLoads.push_back(NewLoad);
2265     }
2266 
2267     // For each member in the group, shuffle out the appropriate data from the
2268     // wide loads.
2269     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2270       Instruction *Member = Group->getMember(I);
2271 
2272       // Skip the gaps in the group.
2273       if (!Member)
2274         continue;
2275 
2276       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2277       for (unsigned Part = 0; Part < UF; Part++) {
2278         Value *StridedVec = Builder.CreateShuffleVector(
2279             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2280 
2281         // If this member has a different type, cast the result type.
2282         if (Member->getType() != ScalarTy) {
2283           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2284           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2285         }
2286 
2287         if (Group->isReverse())
2288           StridedVec = reverseVector(StridedVec);
2289 
2290         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2291       }
2292     }
2293     return;
2294   }
2295 
2296   // The sub-vector type for the current instruction.
2297   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2298 
2299   // Vectorize the interleaved store group.
2300   for (unsigned Part = 0; Part < UF; Part++) {
2301     // Collect the stored vector from each member.
2302     SmallVector<Value *, 4> StoredVecs;
2303     for (unsigned i = 0; i < InterleaveFactor; i++) {
2304       // Interleaved store group doesn't allow a gap, so each index has a member
2305       Instruction *Member = Group->getMember(i);
2306       assert(Member && "Fail to get a member from an interleaved store group");
2307 
2308       Value *StoredVec = getOrCreateVectorValue(
2309           cast<StoreInst>(Member)->getValueOperand(), Part);
2310       if (Group->isReverse())
2311         StoredVec = reverseVector(StoredVec);
2312 
2313       // If this member has a different type, cast it to a unified type.
2314       if (StoredVec->getType() != SubVT)
2316         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2317 
2318       StoredVecs.push_back(StoredVec);
2319     }
2320 
2321     // Concatenate all vectors into a wide vector.
2322     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2323 
2324     // Interleave the elements in the wide vector.
2325     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2326     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2327                                               "interleaved.vec");
2328 
2329     Instruction *NewStoreInstr;
2330     if (BlockInMask) {
2331       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2332       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2333       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2334       Value *ShuffledMask = Builder.CreateShuffleVector(
2335           BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2336       NewStoreInstr = Builder.CreateMaskedStore(
2337           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2338     } else
2340       NewStoreInstr =
2341           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2342 
2343     Group->addMetadata(NewStoreInstr);
2344   }
2345 }
2346 
2347 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2348                                                      VPTransformState &State,
2349                                                      VPValue *Addr,
2350                                                      VPValue *BlockInMask) {
2351   // Attempt to issue a wide load or store.
2352   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2353   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2354 
2355   assert((LI || SI) && "Invalid Load/Store instruction");
2356 
2357   LoopVectorizationCostModel::InstWidening Decision =
2358       Cost->getWideningDecision(Instr, VF);
2359   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2360          "CM decision should be taken at this point");
2361   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2362     return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2363 
2364   Type *ScalarDataTy = getMemInstValueType(Instr);
2365   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2366   // An alignment of 0 means target ABI alignment. We need to use the scalar's
2367   // target ABI alignment in such a case.
2368   const DataLayout &DL = Instr->getModule()->getDataLayout();
2369   const Align Alignment =
2370       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2371 
2372   // Determine if the pointer operand of the access is either consecutive or
2373   // reverse consecutive.
2374   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2375   bool ConsecutiveStride =
2376       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2377   bool CreateGatherScatter =
2378       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2379 
2380   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2381   // gather/scatter. Otherwise Decision should have been to Scalarize.
2382   assert((ConsecutiveStride || CreateGatherScatter) &&
2383          "The instruction should be scalarized");
2384   (void)ConsecutiveStride;
2385 
2386   VectorParts BlockInMaskParts(UF);
2387   bool isMaskRequired = BlockInMask;
2388   if (isMaskRequired)
2389     for (unsigned Part = 0; Part < UF; ++Part)
2390       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2391 
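  // For example (illustrative, names invented): for a consecutive non-reverse
  // access with VF = 4, unroll part 1 addresses the elements [i+4, i+8) of the
  // scalar loop:
  //   %part.ptr = getelementptr i32, i32* %ptr, i32 4
  //   %vec.ptr  = bitcast i32* %part.ptr to <4 x i32>*
  // For a reverse access the pointer is instead moved back by Part * VF and
  // then by another VF - 1 elements, so the wide access ends at the current
  // element.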
2392   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2393     // Calculate the pointer for the specific unroll-part.
2394     GetElementPtrInst *PartPtr = nullptr;
2395 
2396     bool InBounds = false;
2397     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2398       InBounds = gep->isInBounds();
2399 
2400     if (Reverse) {
2401       // If the address is consecutive but reversed, then the
2402       // wide load or store needs to start at the last vector element.
2403       PartPtr = cast<GetElementPtrInst>(
2404           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2405       PartPtr->setIsInBounds(InBounds);
2406       PartPtr = cast<GetElementPtrInst>(
2407           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2408       PartPtr->setIsInBounds(InBounds);
2409       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2410         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2411     } else {
2412       PartPtr = cast<GetElementPtrInst>(
2413           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2414       PartPtr->setIsInBounds(InBounds);
2415     }
2416 
2417     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2418     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2419   };
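  // Illustrative example, not from the original source: with VF = 4 and
  // Part = 1, the forward case computes Ptr + 4, while the reverse case
  // computes Ptr - 4 and then Ptr - 7, so the wide access spans the elements
  // [Ptr - 7, Ptr - 4] and the element order is fixed up by reverseVector.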
2420 
2421   // Handle Stores:
2422   if (SI) {
2423     setDebugLocFromInst(Builder, SI);
2424 
2425     for (unsigned Part = 0; Part < UF; ++Part) {
2426       Instruction *NewSI = nullptr;
2427       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2428       if (CreateGatherScatter) {
2429         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2430         Value *VectorGep = State.get(Addr, Part);
2431         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2432                                             MaskPart);
2433       } else {
2434         if (Reverse) {
2435           // If we store to reverse consecutive memory locations, then we need
2436           // to reverse the order of elements in the stored value.
2437           StoredVal = reverseVector(StoredVal);
2438           // We don't want to update the value in the map as it might be used in
2439           // another expression. So don't call resetVectorValue(StoredVal).
2440         }
2441         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2442         if (isMaskRequired)
2443           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2444                                             BlockInMaskParts[Part]);
2445         else
2446           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2447       }
2448       addMetadata(NewSI, SI);
2449     }
2450     return;
2451   }
2452 
2453   // Handle loads.
2454   assert(LI && "Must have a load instruction");
2455   setDebugLocFromInst(Builder, LI);
2456   for (unsigned Part = 0; Part < UF; ++Part) {
2457     Value *NewLI;
2458     if (CreateGatherScatter) {
2459       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2460       Value *VectorGep = State.get(Addr, Part);
2461       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2462                                          nullptr, "wide.masked.gather");
2463       addMetadata(NewLI, LI);
2464     } else {
2465       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2466       if (isMaskRequired)
2467         NewLI = Builder.CreateMaskedLoad(
2468             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2469             "wide.masked.load");
2470       else
2471         NewLI =
2472             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2473 
2474       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2475       addMetadata(NewLI, LI);
2476       if (Reverse)
2477         NewLI = reverseVector(NewLI);
2478     }
2479     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2480   }
2481 }
2482 
2483 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2484                                                const VPIteration &Instance,
2485                                                bool IfPredicateInstr) {
2486   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2487 
2488   setDebugLocFromInst(Builder, Instr);
2489 
2490   // Does this instruction return a value?
2491   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2492 
2493   Instruction *Cloned = Instr->clone();
2494   if (!IsVoidRetTy)
2495     Cloned->setName(Instr->getName() + ".cloned");
2496 
2497   // Replace the operands of the cloned instructions with their scalar
2498   // equivalents in the new loop.
2499   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2500     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2501     Cloned->setOperand(op, NewOp);
2502   }
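  // For example (illustrative): when a predicated sdiv is scalarized for lane
  // (Part, Lane), its clone's operands are the per-lane scalar values produced
  // for that instance.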
2503   addNewMetadata(Cloned, Instr);
2504 
2505   // Place the cloned scalar in the new loop.
2506   Builder.Insert(Cloned);
2507 
2508   // Add the cloned scalar to the scalar map entry.
2509   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2510 
2511   // If we just cloned a new assumption, add it to the assumption cache.
2512   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2513     if (II->getIntrinsicID() == Intrinsic::assume)
2514       AC->registerAssumption(II);
2515 
2516   // End if-block.
2517   if (IfPredicateInstr)
2518     PredicatedInstructions.push_back(Cloned);
2519 }
2520 
2521 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2522                                                       Value *End, Value *Step,
2523                                                       Instruction *DL) {
2524   BasicBlock *Header = L->getHeader();
2525   BasicBlock *Latch = L->getLoopLatch();
2526   // As we're just creating this loop, it's possible no latch exists
2527   // yet. If so, use the header as this will be a single block loop.
2528   if (!Latch)
2529     Latch = Header;
2530 
2531   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2532   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2533   setDebugLocFromInst(Builder, OldInst);
2534   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2535 
2536   Builder.SetInsertPoint(Latch->getTerminator());
2537   setDebugLocFromInst(Builder, OldInst);
2538 
2539   // Create i+1 and fill the PHINode.
2540   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2541   Induction->addIncoming(Start, L->getLoopPreheader());
2542   Induction->addIncoming(Next, Latch);
2543   // Create the compare.
2544   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2545   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2546 
2547   // Now we have two terminators. Remove the old one from the block.
2548   Latch->getTerminator()->eraseFromParent();
2549 
2550   return Induction;
2551 }
2552 
2553 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2554   if (TripCount)
2555     return TripCount;
2556 
2557   assert(L && "Create Trip Count for null loop.");
2558   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2559   // Find the loop boundaries.
2560   ScalarEvolution *SE = PSE.getSE();
2561   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2562   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2563          "Invalid loop count");
2564 
2565   Type *IdxTy = Legal->getWidestInductionType();
2566   assert(IdxTy && "No type for induction");
2567 
2568   // The exit count might have the type of i64 while the phi is i32. This can
2569   // happen if we have an induction variable that is sign extended before the
2570   // compare. The only way we can get a backedge-taken count here is if the
2571   // induction variable was signed and therefore will not overflow; in that
2572   // case the truncation is legal.
2573   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2574       IdxTy->getPrimitiveSizeInBits())
2575     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2576   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2577 
2578   // Get the total trip count from the count by adding 1.
2579   const SCEV *ExitCount = SE->getAddExpr(
2580       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2581 
2582   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2583 
2584   // Expand the trip count and place the new instructions in the preheader.
2585   // Notice that the pre-header does not change, only the loop body.
2586   SCEVExpander Exp(*SE, DL, "induction");
2587 
2588   // Count holds the overall loop count (N).
2589   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2590                                 L->getLoopPreheader()->getTerminator());
2591 
2592   if (TripCount->getType()->isPointerTy())
2593     TripCount =
2594         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2595                                     L->getLoopPreheader()->getTerminator());
2596 
2597   return TripCount;
2598 }
2599 
2600 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2601   if (VectorTripCount)
2602     return VectorTripCount;
2603 
2604   Value *TC = getOrCreateTripCount(L);
2605   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2606 
2607   Type *Ty = TC->getType();
2608   Constant *Step = ConstantInt::get(Ty, VF * UF);
2609 
2610   // If the tail is to be folded by masking, round the number of iterations N
2611   // up to a multiple of Step instead of rounding down. This is done by first
2612   // adding Step-1 and then rounding down. Note that it's ok if this addition
2613   // overflows: the vector induction variable will eventually wrap to zero given
2614   // that it starts at zero and its Step is a power of two; the loop will then
2615   // exit, with the last early-exit vector comparison also producing all-true.
2616   if (Cost->foldTailByMasking()) {
2617     assert(isPowerOf2_32(VF * UF) &&
2618            "VF*UF must be a power of 2 when folding tail by masking");
2619     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2620   }
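  // Worked example of the round-up (illustrative, not from the original
  // source): with VF * UF = 8 and N = 13, TC becomes 13 + 7 = 20, so the
  // computation below yields n.mod.vf = 20 % 8 = 4 and n.vec = 16; two masked
  // vector iterations then cover all 13 original iterations.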
2621 
2622   // Now we need to generate the expression for the part of the loop that the
2623   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2624   // iterations are not required for correctness, or N - Step, otherwise. Step
2625   // is equal to the vectorization factor (number of SIMD elements) times the
2626   // unroll factor (number of SIMD instructions).
2627   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2628 
2629   // If there is a non-reversed interleaved group that may speculatively access
2630   // memory out-of-bounds, we need to ensure that there will be at least one
2631   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2632   // the trip count, we set the remainder to be equal to the step. If the step
2633   // does not evenly divide the trip count, no adjustment is necessary since
2634   // there will already be scalar iterations. Note that the minimum iterations
2635   // check ensures that N >= Step.
2636   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2637     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2638     R = Builder.CreateSelect(IsZero, Step, R);
2639   }
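  // Illustrative example, not from the original source: with Step = 8 and
  // N = 16, R would be 0; forcing R = Step leaves n.vec = 8 below, so the
  // scalar epilogue is guaranteed to execute at least one iteration.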
2640 
2641   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2642 
2643   return VectorTripCount;
2644 }
2645 
2646 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2647                                                    const DataLayout &DL) {
2648   // Verify that V is a vector type with same number of elements as DstVTy.
2649   unsigned VF = DstVTy->getNumElements();
2650   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2651   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2652   Type *SrcElemTy = SrcVecTy->getElementType();
2653   Type *DstElemTy = DstVTy->getElementType();
2654   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2655          "Vector elements must have same size");
2656 
2657   // Do a direct cast if element types are castable.
2658   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2659     return Builder.CreateBitOrPointerCast(V, DstVTy);
2660   }
2661   // V cannot be directly cast to the desired vector type.
2662   // This may happen when V is a floating point vector but DstVTy is a vector
2663   // of pointers, or vice-versa. Handle this with a two-step bitcast through an
2664   // intermediate integer type, i.e. Ptr <-> Int <-> Float.
2665   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2666          "Only one type should be a pointer type");
2667   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2668          "Only one type should be a floating point type");
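  // For example (illustrative): on a target with 64-bit pointers, casting a
  // <4 x double> to a <4 x i8*> goes through an intermediate <4 x i64>.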
2669   Type *IntTy =
2670       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2671   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2672   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2673   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2674 }
2675 
2676 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2677                                                          BasicBlock *Bypass) {
2678   Value *Count = getOrCreateTripCount(L);
2679   // Reuse existing vector loop preheader for TC checks.
2680   // Note that new preheader block is generated for vector loop.
2681   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2682   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2683 
2684   // Generate code to check if the loop's trip count is less than VF * UF, or
2685   // equal to it in case a scalar epilogue is required; this implies that the
2686   // vector trip count is zero. This check also covers the case where adding one
2687   // to the backedge-taken count overflowed leading to an incorrect trip count
2688   // of zero. In this case we will also jump to the scalar loop.
2689   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2690                                           : ICmpInst::ICMP_ULT;
2691 
2692   // If tail is to be folded, vector loop takes care of all iterations.
2693   Value *CheckMinIters = Builder.getFalse();
2694   if (!Cost->foldTailByMasking())
2695     CheckMinIters = Builder.CreateICmp(
2696         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2697         "min.iters.check");
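  // Illustrative numbers, not from the original source: with VF * UF = 8, a
  // trip count below 8 (or equal to 8 when a scalar epilogue is required)
  // makes CheckMinIters true, so the branch created below goes to Bypass and
  // the vector loop is skipped.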
2698 
2699   // Create new preheader for vector loop.
2700   LoopVectorPreHeader =
2701       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2702                  "vector.ph");
2703 
2704   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2705                                DT->getNode(Bypass)->getIDom()) &&
2706          "TC check is expected to dominate Bypass");
2707 
2708   // Update dominator for Bypass & LoopExit.
2709   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2710   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2711 
2712   ReplaceInstWithInst(
2713       TCCheckBlock->getTerminator(),
2714       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2715   LoopBypassBlocks.push_back(TCCheckBlock);
2716 }
2717 
2718 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2719   // Reuse existing vector loop preheader for SCEV checks.
2720   // Note that new preheader block is generated for vector loop.
2721   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2722 
2723   // Generate the code to check the SCEV assumptions that we made.
2724   // We want the new basic block to start at the first instruction in a
2725   // sequence of instructions that form a check.
2726   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2727                    "scev.check");
2728   Value *SCEVCheck = Exp.expandCodeForPredicate(
2729       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2730 
2731   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2732     if (C->isZero())
2733       return;
2734 
2735   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2736          "Cannot SCEV check stride or overflow when optimizing for size");
2737 
2738   SCEVCheckBlock->setName("vector.scevcheck");
2739   // Create new preheader for vector loop.
2740   LoopVectorPreHeader =
2741       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2742                  nullptr, "vector.ph");
2743 
2744   // Update dominator only if this is the first RT check.
2745   if (LoopBypassBlocks.empty()) {
2746     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2747     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2748   }
2749 
2750   ReplaceInstWithInst(
2751       SCEVCheckBlock->getTerminator(),
2752       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2753   LoopBypassBlocks.push_back(SCEVCheckBlock);
2754   AddedSafetyChecks = true;
2755 }
2756 
2757 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2758   // VPlan-native path does not do any analysis for runtime checks currently.
2759   if (EnableVPlanNativePath)
2760     return;
2761 
2762   // Reuse existing vector loop preheader for runtime memory checks.
2763   // Note that new preheader block is generated for vector loop.
2764   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2765 
2766   // Generate the code that checks in runtime if arrays overlap. We put the
2767   // checks into a separate block to make the more common case of few elements
2768   // faster.
2769   Instruction *FirstCheckInst;
2770   Instruction *MemRuntimeCheck;
2771   std::tie(FirstCheckInst, MemRuntimeCheck) =
2772       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2773   if (!MemRuntimeCheck)
2774     return;
2775 
2776   if (MemCheckBlock->getParent()->hasOptSize()) {
2777     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2778            "Cannot emit memory checks when optimizing for size, unless forced "
2779            "to vectorize.");
2780     ORE->emit([&]() {
2781       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2782                                         L->getStartLoc(), L->getHeader())
2783              << "Code-size may be reduced by not forcing "
2784                 "vectorization, or by source-code modifications "
2785                 "eliminating the need for runtime checks "
2786                 "(e.g., adding 'restrict').";
2787     });
2788   }
2789 
2790   MemCheckBlock->setName("vector.memcheck");
2791   // Create new preheader for vector loop.
2792   LoopVectorPreHeader =
2793       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2794                  "vector.ph");
2795 
2796   // Update dominator only if this is the first RT check.
2797   if (LoopBypassBlocks.empty()) {
2798     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2799     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2800   }
2801 
2802   ReplaceInstWithInst(
2803       MemCheckBlock->getTerminator(),
2804       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2805   LoopBypassBlocks.push_back(MemCheckBlock);
2806   AddedSafetyChecks = true;
2807 
2808   // We currently don't use LoopVersioning for the actual loop cloning but we
2809   // still use it to add the noalias metadata.
2810   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2811                                           PSE.getSE());
2812   LVer->prepareNoAliasMetadata();
2813 }
2814 
2815 Value *InnerLoopVectorizer::emitTransformedIndex(
2816     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2817     const InductionDescriptor &ID) const {
2818 
2819   SCEVExpander Exp(*SE, DL, "induction");
2820   auto Step = ID.getStep();
2821   auto StartValue = ID.getStartValue();
2822   assert(Index->getType() == Step->getType() &&
2823          "Index type does not match StepValue type");
2824 
2825   // Note: the IR at this point is broken. We cannot use SE to create any new
2826   // SCEV and then expand it, hoping that SCEV's simplification will give us
2827   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2828   // lead to various SCEV crashes. So all we can do is use the builder and rely
2829   // on InstCombine for future simplifications. Here we handle some trivial
2830   // cases only.
2831   auto CreateAdd = [&B](Value *X, Value *Y) {
2832     assert(X->getType() == Y->getType() && "Types don't match!");
2833     if (auto *CX = dyn_cast<ConstantInt>(X))
2834       if (CX->isZero())
2835         return Y;
2836     if (auto *CY = dyn_cast<ConstantInt>(Y))
2837       if (CY->isZero())
2838         return X;
2839     return B.CreateAdd(X, Y);
2840   };
2841 
2842   auto CreateMul = [&B](Value *X, Value *Y) {
2843     assert(X->getType() == Y->getType() && "Types don't match!");
2844     if (auto *CX = dyn_cast<ConstantInt>(X))
2845       if (CX->isOne())
2846         return Y;
2847     if (auto *CY = dyn_cast<ConstantInt>(Y))
2848       if (CY->isOne())
2849         return X;
2850     return B.CreateMul(X, Y);
2851   };
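  // Illustrative examples of the transforms below, not from the original
  // source: an integer induction with StartValue 10 and Step 3 maps Index = 4
  // to 10 + 4 * 3 = 22, while a pointer induction GEPs StartValue by
  // Index * Step elements.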
2852 
2853   switch (ID.getKind()) {
2854   case InductionDescriptor::IK_IntInduction: {
2855     assert(Index->getType() == StartValue->getType() &&
2856            "Index type does not match StartValue type");
2857     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2858       return B.CreateSub(StartValue, Index);
2859     auto *Offset = CreateMul(
2860         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2861     return CreateAdd(StartValue, Offset);
2862   }
2863   case InductionDescriptor::IK_PtrInduction: {
2864     assert(isa<SCEVConstant>(Step) &&
2865            "Expected constant step for pointer induction");
2866     return B.CreateGEP(
2867         StartValue->getType()->getPointerElementType(), StartValue,
2868         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2869                                            &*B.GetInsertPoint())));
2870   }
2871   case InductionDescriptor::IK_FpInduction: {
2872     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2873     auto InductionBinOp = ID.getInductionBinOp();
2874     assert(InductionBinOp &&
2875            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2876             InductionBinOp->getOpcode() == Instruction::FSub) &&
2877            "Original bin op should be defined for FP induction");
2878 
2879     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2880 
2881     // Floating point operations had to be 'fast' to enable the induction.
2882     FastMathFlags Flags;
2883     Flags.setFast();
2884 
2885     Value *MulExp = B.CreateFMul(StepValue, Index);
2886     if (isa<Instruction>(MulExp))
2887       // We have to check because MulExp may have been folded to a constant.
2888       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2889 
2890     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2891                                "induction");
2892     if (isa<Instruction>(BOp))
2893       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2894 
2895     return BOp;
2896   }
2897   case InductionDescriptor::IK_NoInduction:
2898     return nullptr;
2899   }
2900   llvm_unreachable("invalid enum");
2901 }
2902 
2903 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2904   /*
2905    In this function we generate a new loop. The new loop will contain
2906    the vectorized instructions while the old loop will continue to run the
2907    scalar remainder.
2908 
2909        [ ] <-- loop iteration number check.
2910     /   |
2911    /    v
2912   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2913   |  /  |
2914   | /   v
2915   ||   [ ]     <-- vector pre header.
2916   |/    |
2917   |     v
2918   |    [  ] \
2919   |    [  ]_|   <-- vector loop.
2920   |     |
2921   |     v
2922   |   -[ ]   <--- middle-block.
2923   |  /  |
2924   | /   v
2925   -|- >[ ]     <--- new preheader.
2926    |    |
2927    |    v
2928    |   [ ] \
2929    |   [ ]_|   <-- old scalar loop to handle remainder.
2930     \   |
2931      \  v
2932       >[ ]     <-- exit block.
2933    ...
2934    */
2935 
2936   MDNode *OrigLoopID = OrigLoop->getLoopID();
2937 
2938   // Some loops have a single integer induction variable, while other loops
2939   // don't. One example is C++ iterators that often have multiple pointer
2940   // induction variables. In the code below we also support a case where we
2941   // don't have a single induction variable.
2942   //
2943   // We try hard to obtain an induction variable from the original loop.
2944   // However, if we don't find one that:
2945   //   - is an integer
2946   //   - counts from zero, stepping by one
2947   //   - is the size of the widest induction variable type
2948   // then we create a new one.
2949   OldInduction = Legal->getPrimaryInduction();
2950   Type *IdxTy = Legal->getWidestInductionType();
2951 
2952   // Split the single block loop into the two loop structure described above.
2953   LoopScalarBody = OrigLoop->getHeader();
2954   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2955   LoopExitBlock = OrigLoop->getExitBlock();
2956   assert(LoopExitBlock && "Must have an exit block");
2957   assert(LoopVectorPreHeader && "Invalid loop structure");
2958 
2959   LoopMiddleBlock =
2960       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2961                  LI, nullptr, "middle.block");
2962   LoopScalarPreHeader =
2963       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2964                  nullptr, "scalar.ph");
2965   // We intentionally don't let SplitBlock update LoopInfo since
2966   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
2967   // LoopVectorBody is explicitly added to the correct place a few lines later.
2968   LoopVectorBody =
2969       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2970                  nullptr, nullptr, "vector.body");
2971 
2972   // Update dominator for loop exit.
2973   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2974 
2975   // Create and register the new vector loop.
2976   Loop *Lp = LI->AllocateLoop();
2977   Loop *ParentLoop = OrigLoop->getParentLoop();
2978 
2979   // Insert the new loop into the loop nest and register the new basic blocks
2980   // before calling any utilities such as SCEV that require valid LoopInfo.
2981   if (ParentLoop) {
2982     ParentLoop->addChildLoop(Lp);
2983   } else {
2984     LI->addTopLevelLoop(Lp);
2985   }
2986   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
2987 
2988   // Find the loop boundaries.
2989   Value *Count = getOrCreateTripCount(Lp);
2990 
2991   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2992 
2993   // Now, compare the new count to zero. If it is zero skip the vector loop and
2994   // jump to the scalar loop. This check also covers the case where the
2995   // backedge-taken count is uint##_max: adding one to it will overflow leading
2996   // to an incorrect trip count of zero. In this (rare) case we will also jump
2997   // to the scalar loop.
2998   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
2999 
3000   // Generate the code to check any assumptions that we've made for SCEV
3001   // expressions.
3002   emitSCEVChecks(Lp, LoopScalarPreHeader);
3003 
3004   // Generate the code that checks in runtime if arrays overlap. We put the
3005   // checks into a separate block to make the more common case of few elements
3006   // faster.
3007   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3008 
3009   // Generate the induction variable.
3010   // The loop step is equal to the vectorization factor (num of SIMD elements)
3011   // times the unroll factor (num of SIMD instructions).
3012   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3013   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3014   Induction =
3015       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3016                               getDebugLocFromInstOrOperands(OldInduction));
3017 
3018   // We are going to resume the execution of the scalar loop.
3019   // Go over all of the induction variables that we found and fix the
3020   // PHIs that are left in the scalar version of the loop.
3021   // The starting values of PHI nodes depend on the counter of the last
3022   // iteration in the vectorized loop.
3023   // If we come from a bypass edge then we need to start from the original
3024   // start value.
3025 
3026   // This variable saves the new starting index for the scalar loop. It is used
3027   // to test if there are any tail iterations left once the vector loop has
3028   // completed.
3029   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3030   for (auto &InductionEntry : *List) {
3031     PHINode *OrigPhi = InductionEntry.first;
3032     InductionDescriptor II = InductionEntry.second;
3033 
3034     // Create phi nodes to merge from the backedge-taken check block.
3035     PHINode *BCResumeVal =
3036         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3037                         LoopScalarPreHeader->getTerminator());
3038     // Copy original phi DL over to the new one.
3039     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3040     Value *&EndValue = IVEndValues[OrigPhi];
3041     if (OrigPhi == OldInduction) {
3042       // We know what the end value is.
3043       EndValue = CountRoundDown;
3044     } else {
3045       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3046       Type *StepType = II.getStep()->getType();
3047       Instruction::CastOps CastOp =
3048           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3049       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3050       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3051       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3052       EndValue->setName("ind.end");
3053     }
3054 
3055     // The new PHI merges the original incoming value, in case of a bypass,
3056     // or the value at the end of the vectorized loop.
3057     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3058 
3059     // Fix the scalar body counter (PHI node).
3060     // The old induction's phi node in the scalar body needs the truncated
3061     // value.
3062     for (BasicBlock *BB : LoopBypassBlocks)
3063       BCResumeVal->addIncoming(II.getStartValue(), BB);
3064     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3065   }
3066 
3067   // We need the OrigLoop (scalar loop part) latch terminator to help
3068   // produce correct debug info for the middle block BB instructions.
3069   // The legality check stage guarantees that the loop will have a single
3070   // latch.
3071   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3072          "Scalar loop latch terminator isn't a branch");
3073   BranchInst *ScalarLatchBr =
3074       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3075 
3076   // Add a check in the middle block to see if we have completed
3077   // all of the iterations in the first vector loop.
3078   // If (N - N%(VF*UF)) == N, then we *don't* need to run the remainder.
3079   // If tail is to be folded, we know we don't need to run the remainder.
3080   Value *CmpN = Builder.getTrue();
3081   if (!Cost->foldTailByMasking()) {
3082     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3083                            CountRoundDown, "cmp.n",
3084                            LoopMiddleBlock->getTerminator());
3085 
3086     // Here we use the same DebugLoc as the scalar loop latch branch instead
3087     // of the corresponding compare because they may have ended up with
3088     // different line numbers and we want to avoid awkward line stepping while
3089     // debugging. E.g., if the compare has a line number inside the loop.
3090     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3091   }
3092 
3093   BranchInst *BrInst =
3094       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3095   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3096   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3097 
3098   // Get ready to start creating new instructions into the vectorized body.
3099   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3100          "Inconsistent vector loop preheader");
3101   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3102 
3103   Optional<MDNode *> VectorizedLoopID =
3104       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3105                                       LLVMLoopVectorizeFollowupVectorized});
3106   if (VectorizedLoopID.hasValue()) {
3107     Lp->setLoopID(VectorizedLoopID.getValue());
3108 
3109     // Do not setAlreadyVectorized if loop attributes have been defined
3110     // explicitly.
3111     return LoopVectorPreHeader;
3112   }
3113 
3114   // Keep all loop hints from the original loop on the vector loop (we'll
3115   // replace the vectorizer-specific hints below).
3116   if (MDNode *LID = OrigLoop->getLoopID())
3117     Lp->setLoopID(LID);
3118 
3119   LoopVectorizeHints Hints(Lp, true, *ORE);
3120   Hints.setAlreadyVectorized();
3121 
3122 #ifdef EXPENSIVE_CHECKS
3123   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3124   LI->verify(*DT);
3125 #endif
3126 
3127   return LoopVectorPreHeader;
3128 }
3129 
3130 // Fix up external users of the induction variable. At this point, we are
3131 // in LCSSA form, with all external PHIs that use the IV having one input value,
3132 // coming from the remainder loop. We need those PHIs to also have a correct
3133 // value for the IV when arriving directly from the middle block.
3134 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3135                                        const InductionDescriptor &II,
3136                                        Value *CountRoundDown, Value *EndValue,
3137                                        BasicBlock *MiddleBlock) {
3138   // There are two kinds of external IV usages - those that use the value
3139   // computed in the last iteration (the PHI) and those that use the penultimate
3140   // value (the value that feeds into the phi from the loop latch).
3141   // We allow both, but they obviously have different values.
3142 
3143   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3144 
3145   DenseMap<Value *, Value *> MissingVals;
3146 
3147   // An external user of the last iteration's value should see the value that
3148   // the remainder loop uses to initialize its own IV.
3149   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3150   for (User *U : PostInc->users()) {
3151     Instruction *UI = cast<Instruction>(U);
3152     if (!OrigLoop->contains(UI)) {
3153       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3154       MissingVals[UI] = EndValue;
3155     }
3156   }
3157 
3158   // An external user of the penultimate value needs to see EndValue - Step.
3159   // The simplest way to get this is to recompute it from the constituent SCEVs,
3160   // that is Start + (Step * (CRD - 1)).
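  // For example (illustrative): with Start = 0, Step = 2 and CRD = 8, the last
  // value is 16 and the penultimate value is 0 + 2 * (8 - 1) = 14.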
3161   for (User *U : OrigPhi->users()) {
3162     auto *UI = cast<Instruction>(U);
3163     if (!OrigLoop->contains(UI)) {
3164       const DataLayout &DL =
3165           OrigLoop->getHeader()->getModule()->getDataLayout();
3166       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3167 
3168       IRBuilder<> B(MiddleBlock->getTerminator());
3169       Value *CountMinusOne = B.CreateSub(
3170           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3171       Value *CMO =
3172           !II.getStep()->getType()->isIntegerTy()
3173               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3174                              II.getStep()->getType())
3175               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3176       CMO->setName("cast.cmo");
3177       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3178       Escape->setName("ind.escape");
3179       MissingVals[UI] = Escape;
3180     }
3181   }
3182 
3183   for (auto &I : MissingVals) {
3184     PHINode *PHI = cast<PHINode>(I.first);
3185     // One corner case we have to handle is two IVs "chasing" each other,
3186     // that is %IV2 = phi [...], [ %IV1, %latch ]
3187     // In this case, if IV1 has an external use, we need to avoid adding both
3188     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3189     // don't already have an incoming value for the middle block.
3190     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3191       PHI->addIncoming(I.second, MiddleBlock);
3192   }
3193 }
3194 
3195 namespace {
3196 
3197 struct CSEDenseMapInfo {
3198   static bool canHandle(const Instruction *I) {
3199     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3200            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3201   }
3202 
3203   static inline Instruction *getEmptyKey() {
3204     return DenseMapInfo<Instruction *>::getEmptyKey();
3205   }
3206 
3207   static inline Instruction *getTombstoneKey() {
3208     return DenseMapInfo<Instruction *>::getTombstoneKey();
3209   }
3210 
3211   static unsigned getHashValue(const Instruction *I) {
3212     assert(canHandle(I) && "Unknown instruction!");
3213     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3214                                                            I->value_op_end()));
3215   }
3216 
3217   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3218     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3219         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3220       return LHS == RHS;
3221     return LHS->isIdenticalTo(RHS);
3222   }
3223 };
3224 
3225 } // end anonymous namespace
3226 
3227 /// Perform CSE of induction variable instructions.
3228 static void cse(BasicBlock *BB) {
3229   // Perform simple CSE.
3230   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3231   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3232     Instruction *In = &*I++;
3233 
3234     if (!CSEDenseMapInfo::canHandle(In))
3235       continue;
3236 
3237     // Check if we can replace this instruction with any of the
3238     // visited instructions.
3239     if (Instruction *V = CSEMap.lookup(In)) {
3240       In->replaceAllUsesWith(V);
3241       In->eraseFromParent();
3242       continue;
3243     }
3244 
3245     CSEMap[In] = In;
3246   }
3247 }
3248 
3249 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3250                                                        unsigned VF,
3251                                                        bool &NeedToScalarize) {
3252   Function *F = CI->getCalledFunction();
3253   Type *ScalarRetTy = CI->getType();
3254   SmallVector<Type *, 4> Tys, ScalarTys;
3255   for (auto &ArgOp : CI->arg_operands())
3256     ScalarTys.push_back(ArgOp->getType());
3257 
3258   // Estimate cost of scalarized vector call. The source operands are assumed
3259   // to be vectors, so we need to extract individual elements from there,
3260   // execute VF scalar calls, and then gather the result into the vector return
3261   // value.
3262   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3263   if (VF == 1)
3264     return ScalarCallCost;
3265 
3266   // Compute corresponding vector type for return value and arguments.
3267   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3268   for (Type *ScalarTy : ScalarTys)
3269     Tys.push_back(ToVectorTy(ScalarTy, VF));
3270 
3271   // Compute costs of unpacking argument values for the scalar calls and
3272   // packing the return values to a vector.
3273   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3274 
3275   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
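  // Illustrative numbers, not from the original source: with VF = 4,
  // ScalarCallCost = 10 and ScalarizationCost = 12, the scalarized estimate is
  // 4 * 10 + 12 = 52; a cheaper vector variant found below is preferred.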
3276 
3277   // If we can't emit a vector call for this function, then the currently found
3278   // cost is the cost we need to return.
3279   NeedToScalarize = true;
3280   if (!TLI || CI->isNoBuiltin() || VFDatabase::getMappings(*CI).empty())
3281     return Cost;
3282 
3283   // If the corresponding vector cost is cheaper, return its cost.
3284   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3285   if (VectorCallCost < Cost) {
3286     NeedToScalarize = false;
3287     return VectorCallCost;
3288   }
3289   return Cost;
3290 }
3291 
3292 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3293                                                             unsigned VF) {
3294   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3295   assert(ID && "Expected intrinsic call!");
3296 
3297   FastMathFlags FMF;
3298   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3299     FMF = FPMO->getFastMathFlags();
3300 
3301   SmallVector<Value *, 4> Operands(CI->arg_operands());
3302   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3303 }
3304 
3305 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3306   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3307   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3308   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3309 }
3310 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3311   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3312   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3313   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3314 }
3315 
3316 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3317   // For every instruction `I` in MinBWs, truncate the operands, create a
3318   // truncated version of `I` and reextend its result. InstCombine runs
3319   // later and will remove any ext/trunc pairs.
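  // For example (illustrative): if MinBWs records 8 bits for an i32 add that
  // was widened to <VF x i32>, its operands are truncated to <VF x i8>, the
  // add is recreated on <VF x i8>, and the result is zero-extended back to
  // <VF x i32> for existing users.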
3320   SmallPtrSet<Value *, 4> Erased;
3321   for (const auto &KV : Cost->getMinimalBitwidths()) {
3322     // If the value wasn't vectorized, we must maintain the original scalar
3323     // type. The absence of the value from VectorLoopValueMap indicates that it
3324     // wasn't vectorized.
3325     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3326       continue;
3327     for (unsigned Part = 0; Part < UF; ++Part) {
3328       Value *I = getOrCreateVectorValue(KV.first, Part);
3329       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3330           !isa<Instruction>(I))
3331         continue;
3332       Type *OriginalTy = I->getType();
3333       Type *ScalarTruncatedTy =
3334           IntegerType::get(OriginalTy->getContext(), KV.second);
3335       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3336                                           OriginalTy->getVectorNumElements());
3337       if (TruncatedTy == OriginalTy)
3338         continue;
3339 
3340       IRBuilder<> B(cast<Instruction>(I));
3341       auto ShrinkOperand = [&](Value *V) -> Value * {
3342         if (auto *ZI = dyn_cast<ZExtInst>(V))
3343           if (ZI->getSrcTy() == TruncatedTy)
3344             return ZI->getOperand(0);
3345         return B.CreateZExtOrTrunc(V, TruncatedTy);
3346       };
3347 
3348       // The actual instruction modification depends on the instruction type,
3349       // unfortunately.
3350       Value *NewI = nullptr;
3351       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3352         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3353                              ShrinkOperand(BO->getOperand(1)));
3354 
3355         // Any wrapping introduced by shrinking this operation shouldn't be
3356         // considered undefined behavior. So, we can't unconditionally copy
3357         // arithmetic wrapping flags to NewI.
3358         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3359       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3360         NewI =
3361             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3362                          ShrinkOperand(CI->getOperand(1)));
3363       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3364         NewI = B.CreateSelect(SI->getCondition(),
3365                               ShrinkOperand(SI->getTrueValue()),
3366                               ShrinkOperand(SI->getFalseValue()));
3367       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3368         switch (CI->getOpcode()) {
3369         default:
3370           llvm_unreachable("Unhandled cast!");
3371         case Instruction::Trunc:
3372           NewI = ShrinkOperand(CI->getOperand(0));
3373           break;
3374         case Instruction::SExt:
3375           NewI = B.CreateSExtOrTrunc(
3376               CI->getOperand(0),
3377               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3378           break;
3379         case Instruction::ZExt:
3380           NewI = B.CreateZExtOrTrunc(
3381               CI->getOperand(0),
3382               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3383           break;
3384         }
3385       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3386         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3387         auto *O0 = B.CreateZExtOrTrunc(
3388             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3389         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3390         auto *O1 = B.CreateZExtOrTrunc(
3391             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3392 
3393         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3394       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3395         // Don't do anything with the operands, just extend the result.
3396         continue;
3397       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3398         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3399         auto *O0 = B.CreateZExtOrTrunc(
3400             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3401         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3402         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3403       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3404         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3405         auto *O0 = B.CreateZExtOrTrunc(
3406             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3407         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3408       } else {
3409         // If we don't know what to do, be conservative and don't do anything.
3410         continue;
3411       }
3412 
3413       // Lastly, extend the result.
3414       NewI->takeName(cast<Instruction>(I));
3415       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3416       I->replaceAllUsesWith(Res);
3417       cast<Instruction>(I)->eraseFromParent();
3418       Erased.insert(I);
3419       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3420     }
3421   }
3422 
3423   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3424   for (const auto &KV : Cost->getMinimalBitwidths()) {
3425     // If the value wasn't vectorized, we must maintain the original scalar
3426     // type. The absence of the value from VectorLoopValueMap indicates that it
3427     // wasn't vectorized.
3428     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3429       continue;
3430     for (unsigned Part = 0; Part < UF; ++Part) {
3431       Value *I = getOrCreateVectorValue(KV.first, Part);
3432       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3433       if (Inst && Inst->use_empty()) {
3434         Value *NewI = Inst->getOperand(0);
3435         Inst->eraseFromParent();
3436         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3437       }
3438     }
3439   }
3440 }
3441 
3442 void InnerLoopVectorizer::fixVectorizedLoop() {
3443   // Insert truncates and extends for any truncated instructions as hints to
3444   // InstCombine.
3445   if (VF > 1)
3446     truncateToMinimalBitwidths();
3447 
3448   // Fix widened non-induction PHIs by setting up the PHI operands.
3449   if (OrigPHIsToFix.size()) {
3450     assert(EnableVPlanNativePath &&
3451            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3452     fixNonInductionPHIs();
3453   }
3454 
3455   // At this point every instruction in the original loop is widened to a
3456   // vector form. Now we need to fix the recurrences in the loop. These PHI
3457   // nodes are currently empty because we did not want to introduce cycles.
3458   // This is the second stage of vectorizing recurrences.
3459   fixCrossIterationPHIs();
3460 
3461   // Forget the original basic block.
3462   PSE.getSE()->forgetLoop(OrigLoop);
3463 
3464   // Fix-up external users of the induction variables.
3465   for (auto &Entry : *Legal->getInductionVars())
3466     fixupIVUsers(Entry.first, Entry.second,
3467                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3468                  IVEndValues[Entry.first], LoopMiddleBlock);
3469 
3470   fixLCSSAPHIs();
3471   for (Instruction *PI : PredicatedInstructions)
3472     sinkScalarOperands(&*PI);
3473 
3474   // Remove redundant induction instructions.
3475   cse(LoopVectorBody);
3476 
3477   // Set/update profile weights for the vector and remainder loops as original
3478   // loop iterations are now distributed among them. The original loop, i.e.
3479   // LoopScalarBody, becomes the remainder loop after vectorization.
3480   //
3481   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3482   // end up getting a slightly roughened result, but that should be OK since
3483   // the profile is not inherently precise anyway. Note also that a possible
3484   // bypass of the vector code caused by legality checks is ignored, assigning
3485   // all the weight to the vector loop, optimistically.
3486   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3487                                LI->getLoopFor(LoopVectorBody),
3488                                LI->getLoopFor(LoopScalarBody), VF * UF);
3489 }
3490 
3491 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3492   // In order to support recurrences we need to be able to vectorize Phi nodes.
3493   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3494   // stage #2: We now need to fix the recurrences by adding incoming edges to
3495   // the currently empty PHI nodes. At this point every instruction in the
3496   // original loop is widened to a vector form so we can use them to construct
3497   // the incoming edges.
3498   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3499     // Handle first-order recurrences and reductions that need to be fixed.
3500     if (Legal->isFirstOrderRecurrence(&Phi))
3501       fixFirstOrderRecurrence(&Phi);
3502     else if (Legal->isReductionVariable(&Phi))
3503       fixReduction(&Phi);
3504   }
3505 }
3506 
3507 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3508   // This is the second phase of vectorizing first-order recurrences. An
3509   // overview of the transformation is described below. Suppose we have the
3510   // following loop.
3511   //
3512   //   for (int i = 0; i < n; ++i)
3513   //     b[i] = a[i] - a[i - 1];
3514   //
3515   // There is a first-order recurrence on "a". For this loop, the shorthand
3516   // scalar IR looks like:
3517   //
3518   //   scalar.ph:
3519   //     s_init = a[-1]
3520   //     br scalar.body
3521   //
3522   //   scalar.body:
3523   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3524   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3525   //     s2 = a[i]
3526   //     b[i] = s2 - s1
3527   //     br cond, scalar.body, ...
3528   //
3529   // In this example, s1 is a recurrence because its value depends on the
3530   // previous iteration. In the first phase of vectorization, we created a
3531   // temporary value for s1. We now complete the vectorization and produce the
3532   // shorthand vector IR shown below (for VF = 4, UF = 1).
3533   //
3534   //   vector.ph:
3535   //     v_init = vector(..., ..., ..., a[-1])
3536   //     br vector.body
3537   //
3538   //   vector.body
3539   //     i = phi [0, vector.ph], [i+4, vector.body]
3540   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3541   //     v2 = a[i, i+1, i+2, i+3];
3542   //     v3 = vector(v1(3), v2(0, 1, 2))
3543   //     b[i, i+1, i+2, i+3] = v2 - v3
3544   //     br cond, vector.body, middle.block
3545   //
3546   //   middle.block:
3547   //     x = v2(3)
3548   //     br scalar.ph
3549   //
3550   //   scalar.ph:
3551   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3552   //     br scalar.body
3553   //
3554   // After the vector loop finishes executing, we extract the next value of
3555   // the recurrence (x) to use as the initial value in the scalar loop.
3556 
3557   // Get the original loop preheader and single loop latch.
3558   auto *Preheader = OrigLoop->getLoopPreheader();
3559   auto *Latch = OrigLoop->getLoopLatch();
3560 
3561   // Get the initial and previous values of the scalar recurrence.
3562   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3563   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3564 
3565   // Create a vector from the initial value.
3566   auto *VectorInit = ScalarInit;
3567   if (VF > 1) {
3568     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3569     VectorInit = Builder.CreateInsertElement(
3570         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3571         Builder.getInt32(VF - 1), "vector.recur.init");
3572   }
3573 
3574   // We constructed a temporary phi node in the first phase of vectorization.
3575   // This phi node will eventually be deleted.
3576   Builder.SetInsertPoint(
3577       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3578 
3579   // Create a phi node for the new recurrence. The current value will either be
3580   // the initial value inserted into a vector or loop-varying vector value.
3581   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3582   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3583 
3584   // Get the vectorized previous value of the last part UF - 1. It appears last
3585   // among all unrolled iterations, due to the order of their construction.
3586   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3587 
3588   // Find and set the insertion point after the previous value if it is an
3589   // instruction.
3590   BasicBlock::iterator InsertPt;
3591   // Note that the previous value may have been constant-folded so it is not
3592   // guaranteed to be an instruction in the vector loop.
3593   // FIXME: Loop invariant values do not form recurrences. We should deal with
3594   //        them earlier.
3595   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3596     InsertPt = LoopVectorBody->getFirstInsertionPt();
3597   else {
3598     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3599     if (isa<PHINode>(PreviousLastPart))
3600       // If the previous value is a phi node, we should insert after all the phi
3601       // nodes in the block containing the PHI to avoid breaking basic block
3602       // verification. Note that the basic block may be different from
3603       // LoopVectorBody, in case we predicate the loop.
3604       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3605     else
3606       InsertPt = ++PreviousInst->getIterator();
3607   }
3608   Builder.SetInsertPoint(&*InsertPt);
3609 
3610   // We will construct a vector for the recurrence by combining the values for
3611   // the current and previous iterations. This is the required shuffle mask.
3612   SmallVector<Constant *, 8> ShuffleMask(VF);
3613   ShuffleMask[0] = Builder.getInt32(VF - 1);
3614   for (unsigned I = 1; I < VF; ++I)
3615     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
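  // Illustrative mask, not from the original source: for VF = 4 this is
  // <3, 4, 5, 6>, i.e. the last lane of the incoming vector followed by the
  // first three lanes of the previous part's vector.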
3616 
3617   // The vector from which to take the initial value for the current iteration
3618   // (actual or unrolled). Initially, this is the vector phi node.
3619   Value *Incoming = VecPhi;
3620 
3621   // Shuffle the current and previous vector and update the vector parts.
3622   for (unsigned Part = 0; Part < UF; ++Part) {
3623     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3624     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3625     auto *Shuffle =
3626         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3627                                              ConstantVector::get(ShuffleMask))
3628                : Incoming;
3629     PhiPart->replaceAllUsesWith(Shuffle);
3630     cast<Instruction>(PhiPart)->eraseFromParent();
3631     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3632     Incoming = PreviousPart;
3633   }
3634 
3635   // Fix the latch value of the new recurrence in the vector loop.
3636   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3637 
3638   // Extract the last vector element in the middle block. This will be the
3639   // initial value for the recurrence when jumping to the scalar loop.
3640   auto *ExtractForScalar = Incoming;
3641   if (VF > 1) {
3642     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3643     ExtractForScalar = Builder.CreateExtractElement(
3644         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3645   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the value of the phi
  // itself and not the last element (the phi update in the current iteration).
  // This will be the value when jumping to the exit block from the
  // LoopMiddleBlock, in case the scalar loop is not run at all.
3651   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3652   if (VF > 1)
3653     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3654         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without being vectorized, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second-to-last element when VF > 1.
3659   else if (UF > 1)
3660     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
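  // For example, with VF == 4 the scalar loop resumes from element 3 of
  // 'Incoming' while a user of the phi outside the loop receives element 2;
  // with VF == 1 and UF == 2, these are the part-1 and part-0 unrolled values
  // of 'Previous', respectively.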
3661 
3662   // Fix the initial value of the original recurrence in the scalar loop.
3663   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3664   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3665   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3666     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3667     Start->addIncoming(Incoming, BB);
3668   }
3669 
3670   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3671   Phi->setName("scalar.recur");
3672 
3673   // Finally, fix users of the recurrence outside the loop. The users will need
3674   // either the last value of the scalar recurrence or the last value of the
3675   // vector recurrence we extracted in the middle block. Since the loop is in
3676   // LCSSA form, we just need to find all the phi nodes for the original scalar
3677   // recurrence in the exit block, and then add an edge for the middle block.
3678   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3679     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3680       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3681     }
3682   }
3683 }
3684 
3685 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3686   Constant *Zero = Builder.getInt32(0);
3687 
  // Get its reduction variable descriptor.
3689   assert(Legal->isReductionVariable(Phi) &&
3690          "Unable to find the reduction variable");
3691   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3692 
3693   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3694   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3695   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3696   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3697     RdxDesc.getMinMaxRecurrenceKind();
3698   setDebugLocFromInst(Builder, ReductionStartValue);
3699 
  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we build the 'identity' vector and overwrite one of its
  // elements with the incoming scalar reduction value. This must be done
  // in the vector-loop preheader.
3704   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3705 
3706   // This is the vector-clone of the value that leaves the loop.
3707   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3708 
  // Find the reduction identity value: zero for addition, or, and xor;
  // one for multiplication; -1 for and.
3711   Value *Identity;
3712   Value *VectorStart;
3713   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3714       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3716     if (VF == 1) {
3717       VectorStart = Identity = ReductionStartValue;
3718     } else {
3719       VectorStart = Identity =
3720         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3721     }
3722   } else {
3723     // Handle other reduction kinds:
3724     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3725         RK, VecTy->getScalarType());
3726     if (VF == 1) {
3727       Identity = Iden;
3728       // This vector is the Identity vector where the first element is the
3729       // incoming scalar reduction.
3730       VectorStart = ReductionStartValue;
3731     } else {
3732       Identity = ConstantVector::getSplat(VF, Iden);
3733 
3734       // This vector is the Identity vector where the first element is the
3735       // incoming scalar reduction.
3736       VectorStart =
3737         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3738     }
3739   }
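  // For example, for an integer add reduction with VF == 4 and start value
  // %s, Identity is <0, 0, 0, 0> and VectorStart is <%s, 0, 0, 0>; for a
  // min/max reduction both are a splat of %s.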
3740 
3741   // Wrap flags are in general invalid after vectorization, clear them.
3742   clearReductionWrapFlags(RdxDesc);
3743 
3744   // Fix the vector-loop phi.
3745 
3746   // Reductions do not have to start at zero. They can start with
3747   // any loop invariant values.
3748   BasicBlock *Latch = OrigLoop->getLoopLatch();
3749   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3750 
3751   for (unsigned Part = 0; Part < UF; ++Part) {
3752     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3753     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3754     // Make sure to add the reduction start value only to the
3755     // first unroll part.
3756     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3757     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3758     cast<PHINode>(VecRdxPhi)
3759       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3760   }
3761 
  // Move the insertion point to the middle block, right after its PHIs and
  // before the values we are going to write. This allows us to emit both
  // PHINodes and the extractelement instructions.
3766   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3767 
3768   setDebugLocFromInst(Builder, LoopExitInst);
3769 
3770   // If tail is folded by masking, the vector value to leave the loop should be
3771   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3772   // instead of the former.
3773   if (Cost->foldTailByMasking()) {
3774     for (unsigned Part = 0; Part < UF; ++Part) {
3775       Value *VecLoopExitInst =
3776           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3777       Value *Sel = nullptr;
3778       for (User *U : VecLoopExitInst->users()) {
3779         if (isa<SelectInst>(U)) {
3780           assert(!Sel && "Reduction exit feeding two selects");
3781           Sel = U;
3782         } else
3783           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3784       }
3785       assert(Sel && "Reduction exit feeds no select");
3786       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3787     }
3788   }
3789 
3790   // If the vector reduction can be performed in a smaller type, we truncate
3791   // then extend the loop exit value to enable InstCombine to evaluate the
3792   // entire expression in the smaller type.
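  // For example, an i32 reduction whose recurrence type is i8 is truncated to
  // <VF x i8> inside the loop and sign- or zero-extended back, so InstCombine
  // can later evaluate the whole reduction chain in i8.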
3793   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3794     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3795     Builder.SetInsertPoint(
3796         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3797     VectorParts RdxParts(UF);
3798     for (unsigned Part = 0; Part < UF; ++Part) {
3799       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3800       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3801       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3802                                         : Builder.CreateZExt(Trunc, VecTy);
3803       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3804            UI != RdxParts[Part]->user_end();)
3805         if (*UI != Trunc) {
3806           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3807           RdxParts[Part] = Extnd;
3808         } else {
3809           ++UI;
3810         }
3811     }
3812     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3813     for (unsigned Part = 0; Part < UF; ++Part) {
3814       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3815       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3816     }
3817   }
3818 
3819   // Reduce all of the unrolled parts into a single vector.
3820   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3821   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3822 
3823   // The middle block terminator has already been assigned a DebugLoc here (the
3824   // OrigLoop's single latch terminator). We want the whole middle block to
3825   // appear to execute on this line because: (a) it is all compiler generated,
3826   // (b) these instructions are always executed after evaluating the latch
3827   // conditional branch, and (c) other passes may add new predecessors which
3828   // terminate on this line. This is the easiest way to ensure we don't
3829   // accidentally cause an extra step back into the loop while debugging.
3830   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3831   for (unsigned Part = 1; Part < UF; ++Part) {
3832     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3833     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3834       // Floating point operations had to be 'fast' to enable the reduction.
3835       ReducedPartRdx = addFastMathFlag(
3836           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3837                               ReducedPartRdx, "bin.rdx"),
3838           RdxDesc.getFastMathFlags());
3839     else
3840       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3841                                       RdxPart);
3842   }
3843 
3844   if (VF > 1) {
3845     bool NoNaN = Legal->hasFunNoNaNAttr();
3846     ReducedPartRdx =
3847         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3848     // If the reduction can be performed in a smaller type, we need to extend
3849     // the reduction to the wider type before we branch to the original loop.
3850     if (Phi->getType() != RdxDesc.getRecurrenceType())
3851       ReducedPartRdx =
3852         RdxDesc.isSigned()
3853         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3854         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3855   }
3856 
3857   // Create a phi node that merges control-flow from the backedge-taken check
3858   // block and the middle block.
3859   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3860                                         LoopScalarPreHeader->getTerminator());
3861   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3862     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3863   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3864 
3865   // Now, we need to fix the users of the reduction variable
3866   // inside and outside of the scalar remainder loop.
3867   // We know that the loop is in LCSSA form. We need to update the
3868   // PHI nodes in the exit blocks.
3869   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3870     // All PHINodes need to have a single entry edge, or two if
3871     // we already fixed them.
3872     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3873 
3874     // We found a reduction value exit-PHI. Update it with the
3875     // incoming bypass edge.
3876     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3877       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3878   } // end of the LCSSA phi scan.
3879 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3882   int IncomingEdgeBlockIdx =
3883     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3884   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3885   // Pick the other block.
3886   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3887   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3888   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3889 }
3890 
3891 void InnerLoopVectorizer::clearReductionWrapFlags(
3892     RecurrenceDescriptor &RdxDesc) {
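  // Wrap flags proven for the original scalar evaluation order are not
  // necessarily valid once the reduction is reassociated across vector lanes
  // and unroll parts: the intermediate values differ, so nsw/nuw on integer
  // add/mul reductions may no longer hold and are dropped below.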
3893   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3894   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3895       RK != RecurrenceDescriptor::RK_IntegerMult)
3896     return;
3897 
3898   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3899   assert(LoopExitInstr && "null loop exit instruction");
3900   SmallVector<Instruction *, 8> Worklist;
3901   SmallPtrSet<Instruction *, 8> Visited;
3902   Worklist.push_back(LoopExitInstr);
3903   Visited.insert(LoopExitInstr);
3904 
3905   while (!Worklist.empty()) {
3906     Instruction *Cur = Worklist.pop_back_val();
3907     if (isa<OverflowingBinaryOperator>(Cur))
3908       for (unsigned Part = 0; Part < UF; ++Part) {
3909         Value *V = getOrCreateVectorValue(Cur, Part);
3910         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3911       }
3912 
3913     for (User *U : Cur->users()) {
3914       Instruction *UI = cast<Instruction>(U);
3915       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3916           Visited.insert(UI).second)
3917         Worklist.push_back(UI);
3918     }
3919   }
3920 }
3921 
3922 void InnerLoopVectorizer::fixLCSSAPHIs() {
3923   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3924     if (LCSSAPhi.getNumIncomingValues() == 1) {
3925       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values have a single value, so lane 0 is used.
3927       unsigned LastLane = 0;
3928       if (isa<Instruction>(IncomingValue))
3929           LastLane = Cost->isUniformAfterVectorization(
3930                          cast<Instruction>(IncomingValue), VF)
3931                          ? 0
3932                          : VF - 1;
3933       // Can be a loop invariant incoming value or the last scalar value to be
3934       // extracted from the vectorized loop.
3935       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3936       Value *lastIncomingValue =
3937           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3938       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3939     }
3940   }
3941 }
3942 
3943 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3944   // The basic block and loop containing the predicated instruction.
3945   auto *PredBB = PredInst->getParent();
3946   auto *VectorLoop = LI->getLoopFor(PredBB);
3947 
3948   // Initialize a worklist with the operands of the predicated instruction.
3949   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3950 
3951   // Holds instructions that we need to analyze again. An instruction may be
3952   // reanalyzed if we don't yet know if we can sink it or not.
3953   SmallVector<Instruction *, 8> InstsToReanalyze;
3954 
3955   // Returns true if a given use occurs in the predicated block. Phi nodes use
3956   // their operands in their corresponding predecessor blocks.
3957   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3958     auto *I = cast<Instruction>(U.getUser());
3959     BasicBlock *BB = I->getParent();
3960     if (auto *Phi = dyn_cast<PHINode>(I))
3961       BB = Phi->getIncomingBlock(
3962           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3963     return BB == PredBB;
3964   };
3965 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a pass
  // through the worklist doesn't sink a single instruction.
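  // For example, if the predicated instruction is a scalarized store, its
  // scalarized address computation may be used only in the predicated block;
  // it can then be moved there, and its own operands become new sinking
  // candidates.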
3970   bool Changed;
3971   do {
3972     // Add the instructions that need to be reanalyzed to the worklist, and
3973     // reset the changed indicator.
3974     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3975     InstsToReanalyze.clear();
3976     Changed = false;
3977 
3978     while (!Worklist.empty()) {
3979       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3980 
3981       // We can't sink an instruction if it is a phi node, is already in the
3982       // predicated block, is not in the loop, or may have side effects.
3983       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3984           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3985         continue;
3986 
3987       // It's legal to sink the instruction if all its uses occur in the
3988       // predicated block. Otherwise, there's nothing to do yet, and we may
3989       // need to reanalyze the instruction.
3990       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3991         InstsToReanalyze.push_back(I);
3992         continue;
3993       }
3994 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
3997       I->moveBefore(&*PredBB->getFirstInsertionPt());
3998       Worklist.insert(I->op_begin(), I->op_end());
3999 
4000       // The sinking may have enabled other instructions to be sunk, so we will
4001       // need to iterate.
4002       Changed = true;
4003     }
4004   } while (Changed);
4005 }
4006 
4007 void InnerLoopVectorizer::fixNonInductionPHIs() {
4008   for (PHINode *OrigPhi : OrigPHIsToFix) {
4009     PHINode *NewPhi =
4010         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4011     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4012 
4013     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4014         predecessors(OrigPhi->getParent()));
4015     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4016         predecessors(NewPhi->getParent()));
4017     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4018            "Scalar and Vector BB should have the same number of predecessors");
4019 
4020     // The insertion point in Builder may be invalidated by the time we get
4021     // here. Force the Builder insertion point to something valid so that we do
4022     // not run into issues during insertion point restore in
4023     // getOrCreateVectorValue calls below.
4024     Builder.SetInsertPoint(NewPhi);
4025 
4026     // The predecessor order is preserved and we can rely on mapping between
4027     // scalar and vector block predecessors.
4028     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4029       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4030 
4031       // When looking up the new scalar/vector values to fix up, use incoming
4032       // values from original phi.
4033       Value *ScIncV =
4034           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4035 
4036       // Scalar incoming value may need a broadcast
4037       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4038       NewPhi->addIncoming(NewIncV, NewPredBB);
4039     }
4040   }
4041 }
4042 
4043 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4044                                    unsigned VF, bool IsPtrLoopInvariant,
4045                                    SmallBitVector &IsIndexLoopInvariant) {
4046   // Construct a vector GEP by widening the operands of the scalar GEP as
4047   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4048   // results in a vector of pointers when at least one operand of the GEP
4049   // is vector-typed. Thus, to keep the representation compact, we only use
4050   // vector-typed operands for loop-varying values.
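  // For example, widening 'getelementptr i32, i32* %base, i64 %idx' with a
  // loop-invariant %base and a loop-varying %idx at VF == 4 keeps %base
  // scalar, uses a <4 x i64> vector for %idx, and yields a <4 x i32*> GEP.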
4051 
4052   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4053     // If we are vectorizing, but the GEP has only loop-invariant operands,
4054     // the GEP we build (by only using vector-typed operands for
4055     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4056     // produce a vector of pointers, we need to either arbitrarily pick an
4057     // operand to broadcast, or broadcast a clone of the original GEP.
4058     // Here, we broadcast a clone of the original.
4059     //
4060     // TODO: If at some point we decide to scalarize instructions having
4061     //       loop-invariant operands, this special case will no longer be
4062     //       required. We would add the scalarization decision to
4063     //       collectLoopScalars() and teach getVectorValue() to broadcast
4064     //       the lane-zero scalar value.
4065     auto *Clone = Builder.Insert(GEP->clone());
4066     for (unsigned Part = 0; Part < UF; ++Part) {
4067       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4068       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4069       addMetadata(EntryPart, GEP);
4070     }
4071   } else {
4072     // If the GEP has at least one loop-varying operand, we are sure to
4073     // produce a vector of pointers. But if we are only unrolling, we want
4074     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4075     // produce with the code below will be scalar (if VF == 1) or vector
4076     // (otherwise). Note that for the unroll-only case, we still maintain
4077     // values in the vector mapping with initVector, as we do for other
4078     // instructions.
4079     for (unsigned Part = 0; Part < UF; ++Part) {
4080       // The pointer operand of the new GEP. If it's loop-invariant, we
4081       // won't broadcast it.
4082       auto *Ptr = IsPtrLoopInvariant
4083                       ? GEP->getPointerOperand()
4084                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4085 
4086       // Collect all the indices for the new GEP. If any index is
4087       // loop-invariant, we won't broadcast it.
4088       SmallVector<Value *, 4> Indices;
4089       for (auto Index : enumerate(GEP->indices())) {
4090         Value *User = Index.value().get();
4091         if (IsIndexLoopInvariant[Index.index()])
4092           Indices.push_back(User);
4093         else
4094           Indices.push_back(getOrCreateVectorValue(User, Part));
4095       }
4096 
4097       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4098       // but it should be a vector, otherwise.
4099       auto *NewGEP =
4100           GEP->isInBounds()
4101               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4102                                           Indices)
4103               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4104       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4105              "NewGEP is not a pointer vector");
4106       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4107       addMetadata(NewGEP, GEP);
4108     }
4109   }
4110 }
4111 
4112 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4113                                               unsigned VF) {
4114   PHINode *P = cast<PHINode>(PN);
4115   if (EnableVPlanNativePath) {
4116     // Currently we enter here in the VPlan-native path for non-induction
4117     // PHIs where all control flow is uniform. We simply widen these PHIs.
4118     // Create a vector phi with no operands - the vector phi operands will be
4119     // set at the end of vector code generation.
4120     Type *VecTy =
4121         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4122     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4123     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4124     OrigPHIsToFix.push_back(P);
4125 
4126     return;
4127   }
4128 
4129   assert(PN->getParent() == OrigLoop->getHeader() &&
4130          "Non-header phis should have been handled elsewhere");
4131 
4132   // In order to support recurrences we need to be able to vectorize Phi nodes.
4133   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4134   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4135   // this value when we vectorize all of the instructions that use the PHI.
4136   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4137     for (unsigned Part = 0; Part < UF; ++Part) {
4138       // This is phase one of vectorizing PHIs.
4139       Type *VecTy =
4140           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4141       Value *EntryPart = PHINode::Create(
4142           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4143       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4144     }
4145     return;
4146   }
4147 
4148   setDebugLocFromInst(Builder, P);
4149 
4150   // This PHINode must be an induction variable.
4151   // Make sure that we know about it.
4152   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4153 
4154   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4155   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4156 
4157   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4158   // which can be found from the original scalar operations.
4159   switch (II.getKind()) {
4160   case InductionDescriptor::IK_NoInduction:
4161     llvm_unreachable("Unknown induction");
4162   case InductionDescriptor::IK_IntInduction:
4163   case InductionDescriptor::IK_FpInduction:
4164     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4165   case InductionDescriptor::IK_PtrInduction: {
4166     // Handle the pointer induction variable case.
4167     assert(P->getType()->isPointerTy() && "Unexpected type.");
4168     // This is the normalized GEP that starts counting at zero.
4169     Value *PtrInd = Induction;
4170     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4171     // Determine the number of scalars we need to generate for each unroll
4172     // iteration. If the instruction is uniform, we only need to generate the
4173     // first lane. Otherwise, we generate all VF values.
4174     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4175     // These are the scalar results. Notice that we don't generate vector GEPs
4176     // because scalar GEPs result in better code.
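    // For example, with UF == 2 and VF == 4, a non-uniform pointer induction
    // gets eight "next.gep" values, one per (part, lane), each at constant
    // offset Lane + Part * VF from the normalized induction.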
4177     for (unsigned Part = 0; Part < UF; ++Part) {
4178       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4179         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4180         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4181         Value *SclrGep =
4182             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4183         SclrGep->setName("next.gep");
4184         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4185       }
4186     }
4187     return;
4188   }
4189   }
4190 }
4191 
4192 /// A helper function for checking whether an integer division-related
4193 /// instruction may divide by zero (in which case it must be predicated if
4194 /// executed conditionally in the scalar code).
4195 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
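/// For example, 'udiv i32 %x, %y' must be predicated when executed
/// conditionally, whereas 'udiv i32 %x, 7' need not be, since its divisor is a
/// known non-zero constant.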
4199 static bool mayDivideByZero(Instruction &I) {
4200   assert((I.getOpcode() == Instruction::UDiv ||
4201           I.getOpcode() == Instruction::SDiv ||
4202           I.getOpcode() == Instruction::URem ||
4203           I.getOpcode() == Instruction::SRem) &&
4204          "Unexpected instruction");
4205   Value *Divisor = I.getOperand(1);
4206   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4207   return !CInt || CInt->isZero();
4208 }
4209 
4210 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4211   switch (I.getOpcode()) {
4212   case Instruction::Br:
4213   case Instruction::PHI:
4214   case Instruction::GetElementPtr:
4215     llvm_unreachable("This instruction is handled by a different recipe.");
4216   case Instruction::UDiv:
4217   case Instruction::SDiv:
4218   case Instruction::SRem:
4219   case Instruction::URem:
4220   case Instruction::Add:
4221   case Instruction::FAdd:
4222   case Instruction::Sub:
4223   case Instruction::FSub:
4224   case Instruction::FNeg:
4225   case Instruction::Mul:
4226   case Instruction::FMul:
4227   case Instruction::FDiv:
4228   case Instruction::FRem:
4229   case Instruction::Shl:
4230   case Instruction::LShr:
4231   case Instruction::AShr:
4232   case Instruction::And:
4233   case Instruction::Or:
4234   case Instruction::Xor: {
4235     // Just widen unops and binops.
4236     setDebugLocFromInst(Builder, &I);
4237 
4238     for (unsigned Part = 0; Part < UF; ++Part) {
4239       SmallVector<Value *, 2> Ops;
4240       for (Value *Op : I.operands())
4241         Ops.push_back(getOrCreateVectorValue(Op, Part));
4242 
4243       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4244 
4245       if (auto *VecOp = dyn_cast<Instruction>(V))
4246         VecOp->copyIRFlags(&I);
4247 
4248       // Use this vector value for all users of the original instruction.
4249       VectorLoopValueMap.setVectorValue(&I, Part, V);
4250       addMetadata(V, &I);
4251     }
4252 
4253     break;
4254   }
4255   case Instruction::Select: {
4256     // Widen selects.
4257     // If the selector is loop invariant we can create a select
4258     // instruction with a scalar condition. Otherwise, use vector-select.
4259     auto *SE = PSE.getSE();
4260     bool InvariantCond =
4261         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4262     setDebugLocFromInst(Builder, &I);
4263 
    // The condition can be loop invariant but still defined inside the
    // loop. This means that we can't just use the original 'cond' value.
    // We have to take the 'vectorized' value and pick the first lane.
    // InstCombine will make this a no-op.
4268 
4269     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4270 
4271     for (unsigned Part = 0; Part < UF; ++Part) {
4272       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4273       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4274       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4275       Value *Sel =
4276           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4277       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4278       addMetadata(Sel, &I);
4279     }
4280 
4281     break;
4282   }
4283 
4284   case Instruction::ICmp:
4285   case Instruction::FCmp: {
4286     // Widen compares. Generate vector compares.
4287     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4288     auto *Cmp = cast<CmpInst>(&I);
4289     setDebugLocFromInst(Builder, Cmp);
4290     for (unsigned Part = 0; Part < UF; ++Part) {
4291       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4292       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4293       Value *C = nullptr;
4294       if (FCmp) {
4295         // Propagate fast math flags.
4296         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4297         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4298         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4299       } else {
4300         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4301       }
4302       VectorLoopValueMap.setVectorValue(&I, Part, C);
4303       addMetadata(C, &I);
4304     }
4305 
4306     break;
4307   }
4308 
4309   case Instruction::ZExt:
4310   case Instruction::SExt:
4311   case Instruction::FPToUI:
4312   case Instruction::FPToSI:
4313   case Instruction::FPExt:
4314   case Instruction::PtrToInt:
4315   case Instruction::IntToPtr:
4316   case Instruction::SIToFP:
4317   case Instruction::UIToFP:
4318   case Instruction::Trunc:
4319   case Instruction::FPTrunc:
4320   case Instruction::BitCast: {
4321     auto *CI = cast<CastInst>(&I);
4322     setDebugLocFromInst(Builder, CI);
4323 
4324     /// Vectorize casts.
4325     Type *DestTy =
4326         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4327 
4328     for (unsigned Part = 0; Part < UF; ++Part) {
4329       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4330       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4331       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4332       addMetadata(Cast, &I);
4333     }
4334     break;
4335   }
4336 
4337   case Instruction::Call: {
4338     // Ignore dbg intrinsics.
4339     if (isa<DbgInfoIntrinsic>(I))
4340       break;
4341     setDebugLocFromInst(Builder, &I);
4342 
4343     Module *M = I.getParent()->getParent()->getParent();
4344     auto *CI = cast<CallInst>(&I);
4345 
4346     SmallVector<Type *, 4> Tys;
4347     for (Value *ArgOperand : CI->arg_operands())
4348       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4349 
4350     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4351 
    // The flag indicates whether we use an intrinsic or an ordinary call for
    // the vectorized version of the instruction, i.e., whether it is
    // beneficial to emit the intrinsic call rather than the library call.
4355     bool NeedToScalarize;
4356     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4357     bool UseVectorIntrinsic =
4358         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4359     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4360            "Instruction should be scalarized elsewhere.");
4361 
4362     for (unsigned Part = 0; Part < UF; ++Part) {
4363       SmallVector<Value *, 4> Args;
4364       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4365         Value *Arg = CI->getArgOperand(i);
4366         // Some intrinsics have a scalar argument - don't replace it with a
4367         // vector.
4368         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4369           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4370         Args.push_back(Arg);
4371       }
4372 
4373       Function *VectorF;
4374       if (UseVectorIntrinsic) {
4375         // Use vector version of the intrinsic.
4376         Type *TysForDecl[] = {CI->getType()};
4377         if (VF > 1)
4378           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4379         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4380       } else {
4381         // Use vector version of the function call.
4382         const VFShape Shape =
4383             VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4384 #ifndef NDEBUG
4385         const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
4386         assert(std::find_if(Infos.begin(), Infos.end(),
4387                             [&Shape](const VFInfo &Info) {
4388                               return Info.Shape == Shape;
4389                             }) != Infos.end() &&
4390                "Vector function shape is missing from the database.");
4391 #endif
4392         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4393       }
4394       assert(VectorF && "Can't create vector function.");
4395 
4396       SmallVector<OperandBundleDef, 1> OpBundles;
4397       CI->getOperandBundlesAsDefs(OpBundles);
4398       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4399 
4400       if (isa<FPMathOperator>(V))
4401         V->copyFastMathFlags(CI);
4402 
4403       VectorLoopValueMap.setVectorValue(&I, Part, V);
4404       addMetadata(V, &I);
4405     }
4406 
4407     break;
4408   }
4409 
4410   default:
4411     // This instruction is not vectorized by simple widening.
4412     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4413     llvm_unreachable("Unhandled instruction!");
4414   } // end of switch.
4415 }
4416 
4417 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4418   // We should not collect Scalars more than once per VF. Right now, this
4419   // function is called from collectUniformsAndScalars(), which already does
4420   // this check. Collecting Scalars for VF=1 does not make any sense.
4421   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4422          "This function should not be visited twice for the same VF");
4423 
4424   SmallSetVector<Instruction *, 8> Worklist;
4425 
4426   // These sets are used to seed the analysis with pointers used by memory
4427   // accesses that will remain scalar.
4428   SmallSetVector<Instruction *, 8> ScalarPtrs;
4429   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4430 
4431   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4432   // The pointer operands of loads and stores will be scalar as long as the
4433   // memory access is not a gather or scatter operation. The value operand of a
4434   // store will remain scalar if the store is scalarized.
4435   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4436     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4437     assert(WideningDecision != CM_Unknown &&
4438            "Widening decision should be ready at this moment");
4439     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4440       if (Ptr == Store->getValueOperand())
4441         return WideningDecision == CM_Scalarize;
4442     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4443            "Ptr is neither a value or pointer operand");
4444     return WideningDecision != CM_GatherScatter;
4445   };
4446 
4447   // A helper that returns true if the given value is a bitcast or
4448   // getelementptr instruction contained in the loop.
4449   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4450     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4451             isa<GetElementPtrInst>(V)) &&
4452            !TheLoop->isLoopInvariant(V);
4453   };
4454 
4455   // A helper that evaluates a memory access's use of a pointer. If the use
4456   // will be a scalar use, and the pointer is only used by memory accesses, we
4457   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4458   // PossibleNonScalarPtrs.
4459   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4460     // We only care about bitcast and getelementptr instructions contained in
4461     // the loop.
4462     if (!isLoopVaryingBitCastOrGEP(Ptr))
4463       return;
4464 
4465     // If the pointer has already been identified as scalar (e.g., if it was
4466     // also identified as uniform), there's nothing to do.
4467     auto *I = cast<Instruction>(Ptr);
4468     if (Worklist.count(I))
4469       return;
4470 
4471     // If the use of the pointer will be a scalar use, and all users of the
4472     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4473     // place the pointer in PossibleNonScalarPtrs.
4474     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4475           return isa<LoadInst>(U) || isa<StoreInst>(U);
4476         }))
4477       ScalarPtrs.insert(I);
4478     else
4479       PossibleNonScalarPtrs.insert(I);
4480   };
4481 
4482   // We seed the scalars analysis with three classes of instructions: (1)
4483   // instructions marked uniform-after-vectorization, (2) bitcast and
4484   // getelementptr instructions used by memory accesses requiring a scalar use,
4485   // and (3) pointer induction variables and their update instructions (we
4486   // currently only scalarize these).
4487   //
4488   // (1) Add to the worklist all instructions that have been identified as
4489   // uniform-after-vectorization.
4490   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4491 
4492   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4493   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4495   // scatter operation. The value operand of a store will remain scalar if the
4496   // store is scalarized.
4497   for (auto *BB : TheLoop->blocks())
4498     for (auto &I : *BB) {
4499       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4500         evaluatePtrUse(Load, Load->getPointerOperand());
4501       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4502         evaluatePtrUse(Store, Store->getPointerOperand());
4503         evaluatePtrUse(Store, Store->getValueOperand());
4504       }
4505     }
4506   for (auto *I : ScalarPtrs)
4507     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4508       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4509       Worklist.insert(I);
4510     }
4511 
4512   // (3) Add to the worklist all pointer induction variables and their update
4513   // instructions.
4514   //
4515   // TODO: Once we are able to vectorize pointer induction variables we should
4516   //       no longer insert them into the worklist here.
4517   auto *Latch = TheLoop->getLoopLatch();
4518   for (auto &Induction : *Legal->getInductionVars()) {
4519     auto *Ind = Induction.first;
4520     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4521     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4522       continue;
4523     Worklist.insert(Ind);
4524     Worklist.insert(IndUpdate);
4525     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4526     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4527                       << "\n");
4528   }
4529 
4530   // Insert the forced scalars.
4531   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4532   // induction variable when the PHI user is scalarized.
4533   auto ForcedScalar = ForcedScalars.find(VF);
4534   if (ForcedScalar != ForcedScalars.end())
4535     for (auto *I : ForcedScalar->second)
4536       Worklist.insert(I);
4537 
4538   // Expand the worklist by looking through any bitcasts and getelementptr
4539   // instructions we've already identified as scalar. This is similar to the
4540   // expansion step in collectLoopUniforms(); however, here we're only
4541   // expanding to include additional bitcasts and getelementptr instructions.
4542   unsigned Idx = 0;
4543   while (Idx != Worklist.size()) {
4544     Instruction *Dst = Worklist[Idx++];
4545     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4546       continue;
4547     auto *Src = cast<Instruction>(Dst->getOperand(0));
4548     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4549           auto *J = cast<Instruction>(U);
4550           return !TheLoop->contains(J) || Worklist.count(J) ||
4551                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4552                   isScalarUse(J, Src));
4553         })) {
4554       Worklist.insert(Src);
4555       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4556     }
4557   }
4558 
4559   // An induction variable will remain scalar if all users of the induction
4560   // variable and induction variable update remain scalar.
4561   for (auto &Induction : *Legal->getInductionVars()) {
4562     auto *Ind = Induction.first;
4563     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4564 
4565     // We already considered pointer induction variables, so there's no reason
4566     // to look at their users again.
4567     //
4568     // TODO: Once we are able to vectorize pointer induction variables we
4569     //       should no longer skip over them here.
4570     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4571       continue;
4572 
4573     // Determine if all users of the induction variable are scalar after
4574     // vectorization.
4575     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4576       auto *I = cast<Instruction>(U);
4577       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4578     });
4579     if (!ScalarInd)
4580       continue;
4581 
4582     // Determine if all users of the induction variable update instruction are
4583     // scalar after vectorization.
4584     auto ScalarIndUpdate =
4585         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4586           auto *I = cast<Instruction>(U);
4587           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4588         });
4589     if (!ScalarIndUpdate)
4590       continue;
4591 
4592     // The induction variable and its update instruction will remain scalar.
4593     Worklist.insert(Ind);
4594     Worklist.insert(IndUpdate);
4595     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4596     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4597                       << "\n");
4598   }
4599 
4600   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4601 }
4602 
4603 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4604   if (!blockNeedsPredication(I->getParent()))
4605     return false;
4606   switch(I->getOpcode()) {
4607   default:
4608     break;
4609   case Instruction::Load:
4610   case Instruction::Store: {
4611     if (!Legal->isMaskRequired(I))
4612       return false;
4613     auto *Ptr = getLoadStorePointerOperand(I);
4614     auto *Ty = getMemInstValueType(I);
4615     // We have already decided how to vectorize this instruction, get that
4616     // result.
4617     if (VF > 1) {
4618       InstWidening WideningDecision = getWideningDecision(I, VF);
4619       assert(WideningDecision != CM_Unknown &&
4620              "Widening decision should be ready at this moment");
4621       return WideningDecision == CM_Scalarize;
4622     }
4623     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4624     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4625                                 isLegalMaskedGather(Ty, Alignment))
4626                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4627                                 isLegalMaskedScatter(Ty, Alignment));
4628   }
4629   case Instruction::UDiv:
4630   case Instruction::SDiv:
4631   case Instruction::SRem:
4632   case Instruction::URem:
4633     return mayDivideByZero(*I);
4634   }
4635   return false;
4636 }
4637 
4638 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4639                                                                unsigned VF) {
4640   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4641   assert(getWideningDecision(I, VF) == CM_Unknown &&
4642          "Decision should not be set yet.");
4643   auto *Group = getInterleavedAccessGroup(I);
4644   assert(Group && "Must have a group.");
4645 
  // If the instruction's allocated size doesn't equal its type size, it
4647   // requires padding and will be scalarized.
4648   auto &DL = I->getModule()->getDataLayout();
4649   auto *ScalarTy = getMemInstValueType(I);
4650   if (hasIrregularType(ScalarTy, DL, VF))
4651     return false;
4652 
4653   // Check if masking is required.
4654   // A Group may need masking for one of two reasons: it resides in a block that
4655   // needs predication, or it was decided to use masking to deal with gaps.
4656   bool PredicatedAccessRequiresMasking =
4657       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4658   bool AccessWithGapsRequiresMasking =
4659       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4660   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4661     return true;
4662 
4663   // If masked interleaving is required, we expect that the user/target had
4664   // enabled it, because otherwise it either wouldn't have been created or
4665   // it should have been invalidated by the CostModel.
4666   assert(useMaskedInterleavedAccesses(TTI) &&
4667          "Masked interleave-groups for predicated accesses are not enabled.");
4668 
4669   auto *Ty = getMemInstValueType(I);
4670   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4671   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4672                           : TTI.isLegalMaskedStore(Ty, Alignment);
4673 }
4674 
4675 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4676                                                                unsigned VF) {
4677   // Get and ensure we have a valid memory instruction.
4678   LoadInst *LI = dyn_cast<LoadInst>(I);
4679   StoreInst *SI = dyn_cast<StoreInst>(I);
4680   assert((LI || SI) && "Invalid memory instruction");
4681 
4682   auto *Ptr = getLoadStorePointerOperand(I);
4683 
  // First of all, in order to be widened the pointer must be consecutive.
4685   if (!Legal->isConsecutivePtr(Ptr))
4686     return false;
4687 
4688   // If the instruction is a store located in a predicated block, it will be
4689   // scalarized.
4690   if (isScalarWithPredication(I))
4691     return false;
4692 
  // If the instruction's allocated size doesn't equal its type size, it
4694   // requires padding and will be scalarized.
4695   auto &DL = I->getModule()->getDataLayout();
4696   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4697   if (hasIrregularType(ScalarTy, DL, VF))
4698     return false;
4699 
4700   return true;
4701 }
4702 
4703 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4704   // We should not collect Uniforms more than once per VF. Right now,
4705   // this function is called from collectUniformsAndScalars(), which
4706   // already does this check. Collecting Uniforms for VF=1 does not make any
4707   // sense.
4708 
4709   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4710          "This function should not be visited twice for the same VF");
4711 
  // Visit the list of Uniforms. Even if we do not find any uniform value, we
  // will not analyze this VF again: Uniforms.count(VF) will return 1.
4714   Uniforms[VF].clear();
4715 
4716   // We now know that the loop is vectorizable!
4717   // Collect instructions inside the loop that will remain uniform after
4718   // vectorization.
4719 
  // Global values, parameters, and instructions outside of the current loop
  // are out of scope.
4722   auto isOutOfScope = [&](Value *V) -> bool {
4723     Instruction *I = dyn_cast<Instruction>(V);
4724     return (!I || !TheLoop->contains(I));
4725   };
4726 
4727   SetVector<Instruction *> Worklist;
4728   BasicBlock *Latch = TheLoop->getLoopLatch();
4729 
4730   // Instructions that are scalar with predication must not be considered
4731   // uniform after vectorization, because that would create an erroneous
4732   // replicating region where only a single instance out of VF should be formed.
4733   // TODO: optimize such seldom cases if found important, see PR40816.
4734   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4735     if (isScalarWithPredication(I, VF)) {
4736       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4737                         << *I << "\n");
4738       return;
4739     }
4740     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4741     Worklist.insert(I);
4742   };
4743 
4744   // Start with the conditional branch. If the branch condition is an
4745   // instruction contained in the loop that is only used by the branch, it is
4746   // uniform.
4747   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4748   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4749     addToWorklistIfAllowed(Cmp);
4750 
4751   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4752   // are pointers that are treated like consecutive pointers during
4753   // vectorization. The pointer operands of interleaved accesses are an
4754   // example.
4755   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4756 
4757   // Holds pointer operands of instructions that are possibly non-uniform.
4758   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4759 
4760   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4761     InstWidening WideningDecision = getWideningDecision(I, VF);
4762     assert(WideningDecision != CM_Unknown &&
4763            "Widening decision should be ready at this moment");
4764 
4765     return (WideningDecision == CM_Widen ||
4766             WideningDecision == CM_Widen_Reverse ||
4767             WideningDecision == CM_Interleave);
4768   };
4769   // Iterate over the instructions in the loop, and collect all
4770   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4771   // that a consecutive-like pointer operand will be scalarized, we collect it
4772   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4773   // getelementptr instruction can be used by both vectorized and scalarized
4774   // memory instructions. For example, if a loop loads and stores from the same
4775   // location, but the store is conditional, the store will be scalarized, and
4776   // the getelementptr won't remain uniform.
4777   for (auto *BB : TheLoop->blocks())
4778     for (auto &I : *BB) {
4779       // If there's no pointer operand, there's nothing to do.
4780       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4781       if (!Ptr)
4782         continue;
4783 
4784       // True if all users of Ptr are memory accesses that have Ptr as their
4785       // pointer operand.
4786       auto UsersAreMemAccesses =
4787           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4788             return getLoadStorePointerOperand(U) == Ptr;
4789           });
4790 
4791       // Ensure the memory instruction will not be scalarized or used by
4792       // gather/scatter, making its pointer operand non-uniform. If the pointer
4793       // operand is used by any instruction other than a memory access, we
4794       // conservatively assume the pointer operand may be non-uniform.
4795       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4796         PossibleNonUniformPtrs.insert(Ptr);
4797 
4798       // If the memory instruction will be vectorized and its pointer operand
4799       // is consecutive-like, or interleaving - the pointer operand should
4800       // remain uniform.
4801       else
4802         ConsecutiveLikePtrs.insert(Ptr);
4803     }
4804 
4805   // Add to the Worklist all consecutive and consecutive-like pointers that
4806   // aren't also identified as possibly non-uniform.
4807   for (auto *V : ConsecutiveLikePtrs)
4808     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4809       addToWorklistIfAllowed(V);
4810 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
4814   unsigned idx = 0;
4815   while (idx != Worklist.size()) {
4816     Instruction *I = Worklist[idx++];
4817 
4818     for (auto OV : I->operand_values()) {
4819       // isOutOfScope operands cannot be uniform instructions.
4820       if (isOutOfScope(OV))
4821         continue;
4822       // First order recurrence Phi's should typically be considered
4823       // non-uniform.
4824       auto *OP = dyn_cast<PHINode>(OV);
4825       if (OP && Legal->isFirstOrderRecurrence(OP))
4826         continue;
4827       // If all the users of the operand are uniform, then add the
4828       // operand into the uniform worklist.
4829       auto *OI = cast<Instruction>(OV);
4830       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4831             auto *J = cast<Instruction>(U);
4832             return Worklist.count(J) ||
4833                    (OI == getLoadStorePointerOperand(J) &&
4834                     isUniformDecision(J, VF));
4835           }))
4836         addToWorklistIfAllowed(OI);
4837     }
4838   }
4839 
4840   // Returns true if Ptr is the pointer operand of a memory access instruction
4841   // I, and I is known to not require scalarization.
4842   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4843     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4844   };
4845 
4846   // For an instruction to be added into Worklist above, all its users inside
4847   // the loop should also be in Worklist. However, this condition cannot be
4848   // true for phi nodes that form a cyclic dependence. We must process phi
4849   // nodes separately. An induction variable will remain uniform if all users
4850   // of the induction variable and induction variable update remain uniform.
4851   // The code below handles both pointer and non-pointer induction variables.
4852   for (auto &Induction : *Legal->getInductionVars()) {
4853     auto *Ind = Induction.first;
4854     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4855 
4856     // Determine if all users of the induction variable are uniform after
4857     // vectorization.
4858     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4859       auto *I = cast<Instruction>(U);
4860       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4861              isVectorizedMemAccessUse(I, Ind);
4862     });
4863     if (!UniformInd)
4864       continue;
4865 
4866     // Determine if all users of the induction variable update instruction are
4867     // uniform after vectorization.
4868     auto UniformIndUpdate =
4869         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4870           auto *I = cast<Instruction>(U);
4871           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4872                  isVectorizedMemAccessUse(I, IndUpdate);
4873         });
4874     if (!UniformIndUpdate)
4875       continue;
4876 
4877     // The induction variable and its update instruction will remain uniform.
4878     addToWorklistIfAllowed(Ind);
4879     addToWorklistIfAllowed(IndUpdate);
4880   }
4881 
4882   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4883 }
4884 
4885 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4886   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4887 
4888   if (Legal->getRuntimePointerChecking()->Need) {
4889     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4890         "runtime pointer checks needed. Enable vectorization of this "
4891         "loop with '#pragma clang loop vectorize(enable)' when "
4892         "compiling with -Os/-Oz",
4893         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4894     return true;
4895   }
4896 
4897   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4898     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4899         "runtime SCEV checks needed. Enable vectorization of this "
4900         "loop with '#pragma clang loop vectorize(enable)' when "
4901         "compiling with -Os/-Oz",
4902         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4903     return true;
4904   }
4905 
4906   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4907   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4908     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4909         "runtime stride == 1 checks needed. Enable vectorization of "
4910         "this loop with '#pragma clang loop vectorize(enable)' when "
4911         "compiling with -Os/-Oz",
4912         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4913     return true;
4914   }
4915 
4916   return false;
4917 }
4918 
4919 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4920   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since it's still likely to be
    // dynamically uniform if the target can skip.
4923     reportVectorizationFailure(
4924         "Not inserting runtime ptr check for divergent target",
4925         "runtime pointer checks needed. Not enabled for divergent target",
4926         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4927     return None;
4928   }
4929 
4930   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4931   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4932   if (TC == 1) {
4933     reportVectorizationFailure("Single iteration (non) loop",
4934         "loop trip count is one, irrelevant for vectorization",
4935         "SingleIterationLoop", ORE, TheLoop);
4936     return None;
4937   }
4938 
4939   switch (ScalarEpilogueStatus) {
4940   case CM_ScalarEpilogueAllowed:
4941     return computeFeasibleMaxVF(TC);
4942   case CM_ScalarEpilogueNotNeededUsePredicate:
4943     LLVM_DEBUG(
4944         dbgs() << "LV: vector predicate hint/switch found.\n"
4945                << "LV: Not allowing scalar epilogue, creating predicated "
4946                << "vector loop.\n");
4947     break;
4948   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4949     // fallthrough as a special case of OptForSize
4950   case CM_ScalarEpilogueNotAllowedOptSize:
4951     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4952       LLVM_DEBUG(
4953           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4954     else
4955       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4956                         << "count.\n");
4957 
    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
4960     if (runtimeChecksRequired())
4961       return None;
4962     break;
4963   }
4964 
  // Now try tail folding by masking.
4966 
4967   // Invalidate interleave groups that require an epilogue if we can't mask
4968   // the interleave-group.
4969   if (!useMaskedInterleavedAccesses(TTI))
4970     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4971 
4972   unsigned MaxVF = computeFeasibleMaxVF(TC);
4973   if (TC > 0 && TC % MaxVF == 0) {
4974     // Accept MaxVF if we do not have a tail.
4975     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4976     return MaxVF;
4977   }
4978 
4979   // If we don't know the precise trip count, or if the trip count that we
4980   // found modulo the vectorization factor is not zero, try to fold the tail
4981   // by masking.
4982   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
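  // For example, a known trip count of 17 with MaxVF = 4 leaves a tail of one
  // iteration, which we attempt to fold by masking below.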
4983   if (Legal->prepareToFoldTailByMasking()) {
4984     FoldTailByMasking = true;
4985     return MaxVF;
4986   }
4987 
4988   if (TC == 0) {
4989     reportVectorizationFailure(
4990         "Unable to calculate the loop count due to complex control flow",
4991         "unable to calculate the loop count due to complex control flow",
4992         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4993     return None;
4994   }
4995 
4996   reportVectorizationFailure(
4997       "Cannot optimize for size and vectorize at the same time.",
4998       "cannot optimize for size and vectorize at the same time. "
4999       "Enable vectorization of this loop with '#pragma clang loop "
5000       "vectorize(enable)' when compiling with -Os/-Oz",
5001       "NoTailLoopWithOptForSize", ORE, TheLoop);
5002   return None;
5003 }
5004 
5005 unsigned
5006 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5007   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5008   unsigned SmallestType, WidestType;
5009   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5010   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5011 
5012   // Get the maximum safe dependence distance in bits computed by LAA.
5013   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5015   // dependence distance).
5016   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5017 
5018   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5019 
5020   unsigned MaxVectorSize = WidestRegister / WidestType;
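  // For example, with 256-bit vector registers and a widest element type of
  // 32 bits, MaxVectorSize is 256 / 32 = 8 lanes (illustrative numbers only).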
5021 
5022   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5023                     << " / " << WidestType << " bits.\n");
5024   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5025                     << WidestRegister << " bits.\n");
5026 
5027   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5028                                  " into one vector!");
5029   if (MaxVectorSize == 0) {
5030     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5031     MaxVectorSize = 1;
5032     return MaxVectorSize;
5033   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5034              isPowerOf2_32(ConstTripCount)) {
5035     // We need to clamp the VF to be the ConstTripCount. There is no point in
5036     // choosing a higher viable VF as done in the loop below.
5037     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5038                       << ConstTripCount << "\n");
5039     MaxVectorSize = ConstTripCount;
5040     return MaxVectorSize;
5041   }
5042 
5043   unsigned MaxVF = MaxVectorSize;
5044   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5045       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5046     // Collect all viable vectorization factors larger than the default MaxVF
5047     // (i.e. MaxVectorSize).
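    // For example (illustrative numbers): with a 256-bit register, a widest
    // type of 64 bits and a smallest type of 8 bits, MaxVectorSize is 4 and
    // NewMaxVectorSize is 32, so VFs 8, 16 and 32 are also considered.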
5048     SmallVector<unsigned, 8> VFs;
5049     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5050     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5051       VFs.push_back(VS);
5052 
5053     // For each VF calculate its register usage.
5054     auto RUs = calculateRegisterUsage(VFs);
5055 
5056     // Select the largest VF which doesn't require more registers than existing
5057     // ones.
5058     for (int i = RUs.size() - 1; i >= 0; --i) {
5059       bool Selected = true;
5060       for (auto& pair : RUs[i].MaxLocalUsers) {
5061         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5062         if (pair.second > TargetNumRegisters)
5063           Selected = false;
5064       }
5065       if (Selected) {
5066         MaxVF = VFs[i];
5067         break;
5068       }
5069     }
5070     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5071       if (MaxVF < MinVF) {
5072         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5073                           << ") with target's minimum: " << MinVF << '\n');
5074         MaxVF = MinVF;
5075       }
5076     }
5077   }
5078   return MaxVF;
5079 }
5080 
5081 VectorizationFactor
5082 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5083   float Cost = expectedCost(1).first;
5084   const float ScalarCost = Cost;
5085   unsigned Width = 1;
5086   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5087 
5088   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5089   if (ForceVectorization && MaxVF > 1) {
5090     // Ignore scalar width, because the user explicitly wants vectorization.
5091     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5092     // evaluation.
5093     Cost = std::numeric_limits<float>::max();
5094   }
5095 
5096   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
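    // For example (illustrative numbers): if the scalar loop costs 8 and the
    // VF = 4 loop costs 20, the per-lane cost is 20 / 4 = 5, which beats the
    // scalar cost and makes VF = 4 the current best width.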
5100     VectorizationCostTy C = expectedCost(i);
5101     float VectorCost = C.first / (float)i;
5102     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5103                       << " costs: " << (int)VectorCost << ".\n");
5104     if (!C.second && !ForceVectorization) {
5105       LLVM_DEBUG(
5106           dbgs() << "LV: Not considering vector loop of width " << i
5107                  << " because it will not generate any vector instructions.\n");
5108       continue;
5109     }
5110     if (VectorCost < Cost) {
5111       Cost = VectorCost;
5112       Width = i;
5113     }
5114   }
5115 
5116   if (!EnableCondStoresVectorization && NumPredStores) {
5117     reportVectorizationFailure("There are conditional stores.",
5118         "store that is conditionally executed prevents vectorization",
5119         "ConditionalStore", ORE, TheLoop);
5120     Width = 1;
5121     Cost = ScalarCost;
5122   }
5123 
5124   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5125              << "LV: Vectorization seems to be not beneficial, "
5126              << "but was forced by a user.\n");
5127   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5128   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5129   return Factor;
5130 }
5131 
5132 std::pair<unsigned, unsigned>
5133 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5134   unsigned MinWidth = -1U;
5135   unsigned MaxWidth = 8;
5136   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5137 
5138   // For each block.
5139   for (BasicBlock *BB : TheLoop->blocks()) {
5140     // For each instruction in the loop.
5141     for (Instruction &I : BB->instructionsWithoutDebug()) {
5142       Type *T = I.getType();
5143 
5144       // Skip ignored values.
5145       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5146         continue;
5147 
5148       // Only examine Loads, Stores and PHINodes.
5149       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5150         continue;
5151 
5152       // Examine PHI nodes that are reduction variables. Update the type to
5153       // account for the recurrence type.
5154       if (auto *PN = dyn_cast<PHINode>(&I)) {
5155         if (!Legal->isReductionVariable(PN))
5156           continue;
5157         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5158         T = RdxDesc.getRecurrenceType();
5159       }
5160 
5161       // Examine the stored values.
5162       if (auto *ST = dyn_cast<StoreInst>(&I))
5163         T = ST->getValueOperand()->getType();
5164 
5165       // Ignore loaded pointer types and stored pointer types that are not
5166       // vectorizable.
5167       //
5168       // FIXME: The check here attempts to predict whether a load or store will
5169       //        be vectorized. We only know this for certain after a VF has
5170       //        been selected. Here, we assume that if an access can be
5171       //        vectorized, it will be. We should also look at extending this
5172       //        optimization to non-pointer types.
5173       //
5174       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5175           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5176         continue;
5177 
5178       MinWidth = std::min(MinWidth,
5179                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5180       MaxWidth = std::max(MaxWidth,
5181                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5182     }
5183   }
5184 
5185   return {MinWidth, MaxWidth};
5186 }
5187 
5188 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5189                                                            unsigned LoopCost) {
5190   // -- The interleave heuristics --
5191   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5192   // There are many micro-architectural considerations that we can't predict
5193   // at this level. For example, frontend pressure (on decode or fetch) due to
5194   // code size, or the number and capabilities of the execution ports.
5195   //
5196   // We use the following heuristics to select the interleave count:
5197   // 1. If the code has reductions, then we interleave to break the cross
5198   // iteration dependency.
5199   // 2. If the loop is really small, then we interleave to reduce the loop
5200   // overhead.
5201   // 3. We don't interleave if we think that we will spill registers to memory
5202   // due to the increased register pressure.
5203 
5204   if (!isScalarEpilogueAllowed())
5205     return 1;
5206 
  // We already used the maximum safe dependence distance to limit the VF, so
  // do not interleave.
5208   if (Legal->getMaxSafeDepDistBytes() != -1U)
5209     return 1;
5210 
5211   // Do not interleave loops with a relatively small known or estimated trip
5212   // count.
5213   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5214   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5215     return 1;
5216 
5217   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so clamp each of them to at least one,
  // i.e. assume that at least one instruction uses at least one register.
5220   for (auto& pair : R.MaxLocalUsers) {
5221     pair.second = std::max(pair.second, 1U);
5222   }
5223 
5224   // We calculate the interleave count using the following formula.
5225   // Subtract the number of loop invariants from the number of available
5226   // registers. These registers are used by all of the interleaved instances.
5227   // Next, divide the remaining registers by the number of registers that is
5228   // required by the loop, in order to estimate how many parallel instances
5229   // fit without causing spills. All of this is rounded down if necessary to be
5230   // a power of two. We want power of two interleave count to simplify any
5231   // addressing operations or alignment considerations.
5232   // We also want power of two interleave counts to ensure that the induction
5233   // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when optimizing for size, in which case IC is set
  // to 1 above.
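  // For example (illustrative numbers): with 32 registers in a class, 2 of
  // them used by loop-invariant values and a maximum local usage of 10, the
  // estimate is PowerOf2Floor((32 - 2) / 10) = PowerOf2Floor(3) = 2 instances.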
5235   unsigned IC = UINT_MAX;
5236 
5237   for (auto& pair : R.MaxLocalUsers) {
5238     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5239     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5240                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5242     if (VF == 1) {
5243       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5244         TargetNumRegisters = ForceTargetNumScalarRegs;
5245     } else {
5246       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5247         TargetNumRegisters = ForceTargetNumVectorRegs;
5248     }
5249     unsigned MaxLocalUsers = pair.second;
5250     unsigned LoopInvariantRegs = 0;
5251     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5252       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5253 
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
5255     // Don't count the induction variable as interleaved.
5256     if (EnableIndVarRegisterHeur) {
5257       TmpIC =
5258           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5259                         std::max(1U, (MaxLocalUsers - 1)));
5260     }
5261 
5262     IC = std::min(IC, TmpIC);
5263   }
5264 
5265   // Clamp the interleave ranges to reasonable counts.
5266   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5267 
5268   // Check if the user has overridden the max.
5269   if (VF == 1) {
5270     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5271       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5272   } else {
5273     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5274       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5275   }
5276 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to at most the trip count divided by VF.
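  // For example, with an estimated trip count of 16 and VF = 4, the
  // interleave count is clamped to at most 16 / 4 = 4.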
5279   if (BestKnownTC) {
5280     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5281   }
5282 
5283   // If we did not calculate the cost for VF (because the user selected the VF)
5284   // then we calculate the cost of VF here.
5285   if (LoopCost == 0)
5286     LoopCost = expectedCost(VF).first;
5287 
5288   assert(LoopCost && "Non-zero loop cost expected");
5289 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5292   if (IC > MaxInterleaveCount)
5293     IC = MaxInterleaveCount;
5294   else if (IC < 1)
5295     IC = 1;
5296 
5297   // Interleave if we vectorized this loop and there is a reduction that could
5298   // benefit from interleaving.
5299   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5300     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5301     return IC;
5302   }
5303 
5304   // Note that if we've already vectorized the loop we will have done the
5305   // runtime check and so interleaving won't require further checks.
5306   bool InterleavingRequiresRuntimePointerCheck =
5307       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5308 
5309   // We want to interleave small loops in order to reduce the loop overhead and
5310   // potentially expose ILP opportunities.
5311   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5312   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead cost is 1, and we use the cost model
    // to estimate the cost of the loop; we then interleave until the cost of
    // the loop overhead is about 5% of the cost of the loop.
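    // For example (illustrative numbers): if SmallLoopCost is 20 and the loop
    // cost is 4, SmallIC is min(IC, PowerOf2Floor(20 / 4)) = min(IC, 4).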
5316     unsigned SmallIC =
5317         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5318 
5319     // Interleave until store/load ports (estimated by max interleave count) are
5320     // saturated.
5321     unsigned NumStores = Legal->getNumStores();
5322     unsigned NumLoads = Legal->getNumLoads();
5323     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5324     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5325 
5326     // If we have a scalar reduction (vector reductions are already dealt with
5327     // by this point), we can increase the critical path length if the loop
5328     // we're interleaving is inside another loop. Limit, by default to 2, so the
5329     // critical path only gets increased by one reduction operation.
5330     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5331       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5332       SmallIC = std::min(SmallIC, F);
5333       StoresIC = std::min(StoresIC, F);
5334       LoadsIC = std::min(LoadsIC, F);
5335     }
5336 
5337     if (EnableLoadStoreRuntimeInterleave &&
5338         std::max(StoresIC, LoadsIC) > SmallIC) {
5339       LLVM_DEBUG(
5340           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5341       return std::max(StoresIC, LoadsIC);
5342     }
5343 
5344     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5345     return SmallIC;
5346   }
5347 
5348   // Interleave if this is a large loop (small loops are already dealt with by
5349   // this point) that could benefit from interleaving.
5350   bool HasReductions = !Legal->getReductionVars()->empty();
5351   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5352     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5353     return IC;
5354   }
5355 
5356   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5357   return 1;
5358 }
5359 
5360 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5361 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5362   // This function calculates the register usage by measuring the highest number
5363   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5365   // assign a number to each instruction. We use RPO to ensure that defs are
5366   // met before their users. We assume that each instruction that has in-loop
5367   // users starts an interval. We record every time that an in-loop value is
5368   // used, so we have a list of the first and last occurrences of each
5369   // instruction. Next, we transpose this data structure into a multi map that
5370   // holds the list of intervals that *end* at a specific location. This multi
5371   // map allows us to perform a linear search. We scan the instructions linearly
5372   // and record each time that a new interval starts, by placing it in a set.
5373   // If we find this value in the multi-map then we remove it from the set.
5374   // The max register usage is the maximum size of the set.
5375   // We also search for instructions that are defined outside the loop, but are
5376   // used inside the loop. We need this number separately from the max-interval
5377   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
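  // For example, a value defined at index 2 whose last use is at index 7
  // stays in the open-interval set, and thus counts toward the register
  // pressure estimate, for all the instructions in between.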
5379   LoopBlocksDFS DFS(TheLoop);
5380   DFS.perform(LI);
5381 
5382   RegisterUsage RU;
5383 
5384   // Each 'key' in the map opens a new interval. The values
5385   // of the map are the index of the 'last seen' usage of the
5386   // instruction that is the key.
5387   using IntervalMap = DenseMap<Instruction *, unsigned>;
5388 
5389   // Maps instruction to its index.
5390   SmallVector<Instruction *, 64> IdxToInstr;
5391   // Marks the end of each interval.
5392   IntervalMap EndPoint;
  // Saves the set of instructions that are used in the loop.
5394   SmallPtrSet<Instruction *, 8> Ends;
5395   // Saves the list of values that are used in the loop but are
5396   // defined outside the loop, such as arguments and constants.
5397   SmallPtrSet<Value *, 8> LoopInvariants;
5398 
5399   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5400     for (Instruction &I : BB->instructionsWithoutDebug()) {
5401       IdxToInstr.push_back(&I);
5402 
5403       // Save the end location of each USE.
5404       for (Value *U : I.operands()) {
5405         auto *Instr = dyn_cast<Instruction>(U);
5406 
5407         // Ignore non-instruction values such as arguments, constants, etc.
5408         if (!Instr)
5409           continue;
5410 
5411         // If this instruction is outside the loop then record it and continue.
5412         if (!TheLoop->contains(Instr)) {
5413           LoopInvariants.insert(Instr);
5414           continue;
5415         }
5416 
5417         // Overwrite previous end points.
5418         EndPoint[Instr] = IdxToInstr.size();
5419         Ends.insert(Instr);
5420       }
5421     }
5422   }
5423 
5424   // Saves the list of intervals that end with the index in 'key'.
5425   using InstrList = SmallVector<Instruction *, 2>;
5426   DenseMap<unsigned, InstrList> TransposeEnds;
5427 
5428   // Transpose the EndPoints to a list of values that end at each index.
5429   for (auto &Interval : EndPoint)
5430     TransposeEnds[Interval.second].push_back(Interval.first);
5431 
5432   SmallPtrSet<Instruction *, 8> OpenIntervals;
5433 
5434   // Get the size of the widest register.
5435   unsigned MaxSafeDepDist = -1U;
5436   if (Legal->getMaxSafeDepDistBytes() != -1U)
5437     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5438   unsigned WidestRegister =
5439       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5440   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5441 
5442   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5443   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5444 
5445   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5446 
5447   // A lambda that gets the register usage for the given type and VF.
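  // For example (illustrative numbers), a 32-bit element type at VF = 8 on a
  // target with 128-bit wide registers needs 8 * 32 / 128 = 2 registers.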
5448   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5449     if (Ty->isTokenTy())
5450       return 0U;
5451     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5452     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5453   };
5454 
5455   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5456     Instruction *I = IdxToInstr[i];
5457 
5458     // Remove all of the instructions that end at this location.
5459     InstrList &List = TransposeEnds[i];
5460     for (Instruction *ToRemove : List)
5461       OpenIntervals.erase(ToRemove);
5462 
5463     // Ignore instructions that are never used within the loop.
5464     if (Ends.find(I) == Ends.end())
5465       continue;
5466 
5467     // Skip ignored values.
5468     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5469       continue;
5470 
5471     // For each VF find the maximum usage of registers.
5472     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5473       // Count the number of live intervals.
5474       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5475 
5476       if (VFs[j] == 1) {
5477         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
5479           if (RegUsage.find(ClassID) == RegUsage.end())
5480             RegUsage[ClassID] = 1;
5481           else
5482             RegUsage[ClassID] += 1;
5483         }
5484       } else {
5485         collectUniformsAndScalars(VFs[j]);
5486         for (auto Inst : OpenIntervals) {
5487           // Skip ignored values for VF > 1.
5488           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5489             continue;
5490           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
5492             if (RegUsage.find(ClassID) == RegUsage.end())
5493               RegUsage[ClassID] = 1;
5494             else
5495               RegUsage[ClassID] += 1;
5496           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
5498             if (RegUsage.find(ClassID) == RegUsage.end())
5499               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5500             else
5501               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5502           }
5503         }
5504       }
5505 
5506       for (auto& pair : RegUsage) {
5507         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
5509         else
5510           MaxUsages[j][pair.first] = pair.second;
5511       }
5512     }
5513 
5514     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5515                       << OpenIntervals.size() << '\n');
5516 
5517     // Add the current instruction to the list of open intervals.
5518     OpenIntervals.insert(I);
5519   }
5520 
5521   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5522     SmallMapVector<unsigned, unsigned, 4> Invariant;
5523 
5524     for (auto Inst : LoopInvariants) {
5525       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5527       if (Invariant.find(ClassID) == Invariant.end())
5528         Invariant[ClassID] = Usage;
5529       else
5530         Invariant[ClassID] += Usage;
5531     }
5532 
5533     LLVM_DEBUG({
5534       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5535       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5536              << " item\n";
5537       for (const auto &pair : MaxUsages[i]) {
5538         dbgs() << "LV(REG): RegisterClass: "
5539                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5540                << " registers\n";
5541       }
5542       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5543              << " item\n";
5544       for (const auto &pair : Invariant) {
5545         dbgs() << "LV(REG): RegisterClass: "
5546                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5547                << " registers\n";
5548       }
5549     });
5550 
5551     RU.LoopInvariantRegs = Invariant;
5552     RU.MaxLocalUsers = MaxUsages[i];
5553     RUs[i] = RU;
5554   }
5555 
5556   return RUs;
5557 }
5558 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5560   // TODO: Cost model for emulated masked load/store is completely
5561   // broken. This hack guides the cost model to use an artificially
5562   // high enough value to practically disable vectorization with such
5563   // operations, except where previously deployed legality hack allowed
5564   // using very low cost values. This is to avoid regressions coming simply
5565   // from moving "masked load/store" check from legality to cost model.
  // Masked load/gather emulation was previously never allowed.
  // Only a limited number of masked store/scatter emulations was allowed.
5568   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5569   return isa<LoadInst>(I) ||
5570          (isa<StoreInst>(I) &&
5571           NumPredStores > NumberOfStoresToPredicate);
5572 }
5573 
5574 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5575   // If we aren't vectorizing the loop, or if we've already collected the
5576   // instructions to scalarize, there's nothing to do. Collection may already
5577   // have occurred if we have a user-selected VF and are now computing the
5578   // expected cost for interleaving.
5579   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5580     return;
5581 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5583   // not profitable to scalarize any instructions, the presence of VF in the
5584   // map will indicate that we've analyzed it already.
5585   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5586 
5587   // Find all the instructions that are scalar with predication in the loop and
5588   // determine if it would be better to not if-convert the blocks they are in.
5589   // If so, we also record the instructions to scalarize.
5590   for (BasicBlock *BB : TheLoop->blocks()) {
5591     if (!blockNeedsPredication(BB))
5592       continue;
5593     for (Instruction &I : *BB)
5594       if (isScalarWithPredication(&I)) {
5595         ScalarCostsTy ScalarCosts;
5596         // Do not apply discount logic if hacked cost is needed
5597         // for emulated masked memrefs.
5598         if (!useEmulatedMaskMemRefHack(&I) &&
5599             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5600           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5601         // Remember that BB will remain after vectorization.
5602         PredicatedBBsAfterVectorization.insert(BB);
5603       }
5604   }
5605 }
5606 
5607 int LoopVectorizationCostModel::computePredInstDiscount(
5608     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5609     unsigned VF) {
5610   assert(!isUniformAfterVectorization(PredInst, VF) &&
5611          "Instruction marked uniform-after-vectorization will be predicated");
5612 
5613   // Initialize the discount to zero, meaning that the scalar version and the
5614   // vector version cost the same.
5615   int Discount = 0;
5616 
5617   // Holds instructions to analyze. The instructions we visit are mapped in
5618   // ScalarCosts. Those instructions are the ones that would be scalarized if
5619   // we find that the scalar version costs less.
5620   SmallVector<Instruction *, 8> Worklist;
5621 
5622   // Returns true if the given instruction can be scalarized.
5623   auto canBeScalarized = [&](Instruction *I) -> bool {
5624     // We only attempt to scalarize instructions forming a single-use chain
5625     // from the original predicated block that would otherwise be vectorized.
5626     // Although not strictly necessary, we give up on instructions we know will
5627     // already be scalar to avoid traversing chains that are unlikely to be
5628     // beneficial.
5629     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5630         isScalarAfterVectorization(I, VF))
5631       return false;
5632 
5633     // If the instruction is scalar with predication, it will be analyzed
5634     // separately. We ignore it within the context of PredInst.
5635     if (isScalarWithPredication(I))
5636       return false;
5637 
5638     // If any of the instruction's operands are uniform after vectorization,
5639     // the instruction cannot be scalarized. This prevents, for example, a
5640     // masked load from being scalarized.
5641     //
5642     // We assume we will only emit a value for lane zero of an instruction
5643     // marked uniform after vectorization, rather than VF identical values.
5644     // Thus, if we scalarize an instruction that uses a uniform, we would
5645     // create uses of values corresponding to the lanes we aren't emitting code
5646     // for. This behavior can be changed by allowing getScalarValue to clone
5647     // the lane zero values for uniforms rather than asserting.
5648     for (Use &U : I->operands())
5649       if (auto *J = dyn_cast<Instruction>(U.get()))
5650         if (isUniformAfterVectorization(J, VF))
5651           return false;
5652 
5653     // Otherwise, we can scalarize the instruction.
5654     return true;
5655   };
5656 
5657   // Compute the expected cost discount from scalarizing the entire expression
5658   // feeding the predicated instruction. We currently only consider expressions
5659   // that are single-use instruction chains.
5660   Worklist.push_back(PredInst);
5661   while (!Worklist.empty()) {
5662     Instruction *I = Worklist.pop_back_val();
5663 
5664     // If we've already analyzed the instruction, there's nothing to do.
5665     if (ScalarCosts.find(I) != ScalarCosts.end())
5666       continue;
5667 
5668     // Compute the cost of the vector instruction. Note that this cost already
5669     // includes the scalarization overhead of the predicated instruction.
5670     unsigned VectorCost = getInstructionCost(I, VF).first;
5671 
5672     // Compute the cost of the scalarized instruction. This cost is the cost of
5673     // the instruction as if it wasn't if-converted and instead remained in the
5674     // predicated block. We will scale this cost by block probability after
5675     // computing the scalarization overhead.
5676     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5677 
5678     // Compute the scalarization overhead of needed insertelement instructions
5679     // and phi nodes.
5680     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5681       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5682                                                  true, false);
5683       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5684     }
5685 
5686     // Compute the scalarization overhead of needed extractelement
5687     // instructions. For each of the instruction's operands, if the operand can
5688     // be scalarized, add it to the worklist; otherwise, account for the
5689     // overhead.
5690     for (Use &U : I->operands())
5691       if (auto *J = dyn_cast<Instruction>(U.get())) {
5692         assert(VectorType::isValidElementType(J->getType()) &&
5693                "Instruction has non-scalar type");
5694         if (canBeScalarized(J))
5695           Worklist.push_back(J);
5696         else if (needsExtract(J, VF))
5697           ScalarCost += TTI.getScalarizationOverhead(
                              ToVectorTy(J->getType(), VF), false, true);
5699       }
5700 
5701     // Scale the total scalar cost by block probability.
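    // For example, if getReciprocalPredBlockProb() returns 2 (i.e. the block
    // is assumed to execute on half of the iterations), a scalar cost of 8
    // becomes 4.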
5702     ScalarCost /= getReciprocalPredBlockProb();
5703 
5704     // Compute the discount. A non-negative discount means the vector version
5705     // of the instruction costs more, and scalarizing would be beneficial.
5706     Discount += VectorCost - ScalarCost;
5707     ScalarCosts[I] = ScalarCost;
5708   }
5709 
5710   return Discount;
5711 }
5712 
5713 LoopVectorizationCostModel::VectorizationCostTy
5714 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5715   VectorizationCostTy Cost;
5716 
5717   // For each block.
5718   for (BasicBlock *BB : TheLoop->blocks()) {
5719     VectorizationCostTy BlockCost;
5720 
5721     // For each instruction in the old loop.
5722     for (Instruction &I : BB->instructionsWithoutDebug()) {
5723       // Skip ignored values.
5724       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5725           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5726         continue;
5727 
5728       VectorizationCostTy C = getInstructionCost(&I, VF);
5729 
5730       // Check if we should override the cost.
5731       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5732         C.first = ForceTargetInstructionCost;
5733 
5734       BlockCost.first += C.first;
5735       BlockCost.second |= C.second;
5736       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5737                         << " for VF " << VF << " For instruction: " << I
5738                         << '\n');
5739     }
5740 
5741     // If we are vectorizing a predicated block, it will have been
5742     // if-converted. This means that the block's instructions (aside from
5743     // stores and instructions that may divide by zero) will now be
5744     // unconditionally executed. For the scalar case, we may not always execute
5745     // the predicated block. Thus, scale the block's cost by the probability of
5746     // executing it.
5747     if (VF == 1 && blockNeedsPredication(BB))
5748       BlockCost.first /= getReciprocalPredBlockProb();
5749 
5750     Cost.first += BlockCost.first;
5751     Cost.second |= BlockCost.second;
5752   }
5753 
5754   return Cost;
5755 }
5756 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
5759 ///
5760 /// This SCEV can be sent to the Target in order to estimate the address
5761 /// calculation cost.
5762 static const SCEV *getAddressAccessSCEV(
5763               Value *Ptr,
5764               LoopVectorizationLegality *Legal,
5765               PredicatedScalarEvolution &PSE,
5766               const Loop *TheLoop) {
5767 
5768   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5769   if (!Gep)
5770     return nullptr;
5771 
5772   // We are looking for a gep with all loop invariant indices except for one
5773   // which should be an induction variable.
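  // For example, a hypothetical GEP such as
  //   getelementptr inbounds i32, i32* %inv, i64 %ind
  // qualifies: %inv is loop invariant and %ind is an induction variable.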
5774   auto SE = PSE.getSE();
5775   unsigned NumOperands = Gep->getNumOperands();
5776   for (unsigned i = 1; i < NumOperands; ++i) {
5777     Value *Opd = Gep->getOperand(i);
5778     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5779         !Legal->isInductionVariable(Opd))
5780       return nullptr;
5781   }
5782 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5784   return PSE.getSCEV(Ptr);
5785 }
5786 
5787 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5788   return Legal->hasStride(I->getOperand(0)) ||
5789          Legal->hasStride(I->getOperand(1));
5790 }
5791 
5792 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5793                                                                  unsigned VF) {
5794   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5795   Type *ValTy = getMemInstValueType(I);
5796   auto SE = PSE.getSE();
5797 
5798   unsigned AS = getLoadStoreAddressSpace(I);
5799   Value *Ptr = getLoadStorePointerOperand(I);
5800   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5801 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5804   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5805 
5806   // Get the cost of the scalar memory instruction and address computation.
5807   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5808 
5809   // Don't pass *I here, since it is scalar but will actually be part of a
5810   // vectorized loop where the user of it is a vectorized instruction.
5811   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5812   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5813                                    Alignment, AS);
5814 
5815   // Get the overhead of the extractelement and insertelement instructions
5816   // we might create due to scalarization.
5817   Cost += getScalarizationOverhead(I, VF);
5818 
5819   // If we have a predicated store, it may not be executed for each vector
5820   // lane. Scale the cost by the probability of executing the predicated
5821   // block.
5822   if (isPredicatedInst(I)) {
5823     Cost /= getReciprocalPredBlockProb();
5824 
5825     if (useEmulatedMaskMemRefHack(I))
5826       // Artificially setting to a high enough value to practically disable
5827       // vectorization with such operations.
5828       Cost = 3000000;
5829   }
5830 
5831   return Cost;
5832 }
5833 
5834 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5835                                                              unsigned VF) {
5836   Type *ValTy = getMemInstValueType(I);
5837   Type *VectorTy = ToVectorTy(ValTy, VF);
5838   Value *Ptr = getLoadStorePointerOperand(I);
5839   unsigned AS = getLoadStoreAddressSpace(I);
5840   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5841 
5842   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5843          "Stride should be 1 or -1 for consecutive memory access");
5844   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5845   unsigned Cost = 0;
5846   if (Legal->isMaskRequired(I))
5847     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5848                                       Alignment ? Alignment->value() : 0, AS);
5849   else
5850     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5851 
5852   bool Reverse = ConsecutiveStride < 0;
5853   if (Reverse)
5854     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5855   return Cost;
5856 }
5857 
5858 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5859                                                          unsigned VF) {
5860   Type *ValTy = getMemInstValueType(I);
5861   Type *VectorTy = ToVectorTy(ValTy, VF);
5862   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5863   unsigned AS = getLoadStoreAddressSpace(I);
5864   if (isa<LoadInst>(I)) {
5865     return TTI.getAddressComputationCost(ValTy) +
5866            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5867            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5868   }
5869   StoreInst *SI = cast<StoreInst>(I);
5870 
5871   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5872   return TTI.getAddressComputationCost(ValTy) +
5873          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5874          (isLoopInvariantStoreValue
5875               ? 0
5876               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5877                                        VF - 1));
5878 }
5879 
5880 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5881                                                           unsigned VF) {
5882   Type *ValTy = getMemInstValueType(I);
5883   Type *VectorTy = ToVectorTy(ValTy, VF);
5884   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5885   Value *Ptr = getLoadStorePointerOperand(I);
5886 
5887   return TTI.getAddressComputationCost(VectorTy) +
5888          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5889                                     Legal->isMaskRequired(I),
5890                                     Alignment ? Alignment->value() : 0);
5891 }
5892 
5893 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5894                                                             unsigned VF) {
5895   Type *ValTy = getMemInstValueType(I);
5896   Type *VectorTy = ToVectorTy(ValTy, VF);
5897   unsigned AS = getLoadStoreAddressSpace(I);
5898 
5899   auto Group = getInterleavedAccessGroup(I);
5900   assert(Group && "Fail to get an interleaved access group.");
5901 
5902   unsigned InterleaveFactor = Group->getFactor();
5903   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5904 
5905   // Holds the indices of existing members in an interleaved load group.
5906   // An interleaved store group doesn't need this as it doesn't allow gaps.
5907   SmallVector<unsigned, 4> Indices;
5908   if (isa<LoadInst>(I)) {
5909     for (unsigned i = 0; i < InterleaveFactor; i++)
5910       if (Group->getMember(i))
5911         Indices.push_back(i);
5912   }
5913 
5914   // Calculate the cost of the whole interleaved group.
5915   bool UseMaskForGaps =
5916       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5917   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5918       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5919       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5920 
5921   if (Group->isReverse()) {
5922     // TODO: Add support for reversed masked interleaved access.
5923     assert(!Legal->isMaskRequired(I) &&
5924            "Reverse masked interleaved access not supported.");
5925     Cost += Group->getNumMembers() *
5926             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5927   }
5928   return Cost;
5929 }
5930 
5931 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5932                                                               unsigned VF) {
5933   // Calculate scalar cost only. Vectorization cost should be ready at this
5934   // moment.
5935   if (VF == 1) {
5936     Type *ValTy = getMemInstValueType(I);
5937     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5938     unsigned AS = getLoadStoreAddressSpace(I);
5939 
5940     return TTI.getAddressComputationCost(ValTy) +
5941            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5942   }
5943   return getWideningCost(I, VF);
5944 }
5945 
5946 LoopVectorizationCostModel::VectorizationCostTy
5947 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5948   // If we know that this instruction will remain uniform, check the cost of
5949   // the scalar version.
5950   if (isUniformAfterVectorization(I, VF))
5951     VF = 1;
5952 
5953   if (VF > 1 && isProfitableToScalarize(I, VF))
5954     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5955 
5956   // Forced scalars do not have any scalarization overhead.
5957   auto ForcedScalar = ForcedScalars.find(VF);
5958   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5959     auto InstSet = ForcedScalar->second;
5960     if (InstSet.find(I) != InstSet.end())
5961       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5962   }
5963 
5964   Type *VectorTy;
5965   unsigned C = getInstructionCost(I, VF, VectorTy);
5966 
5967   bool TypeNotScalarized =
5968       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5969   return VectorizationCostTy(C, TypeNotScalarized);
5970 }
5971 
5972 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5973                                                               unsigned VF) {
5974 
5975   if (VF == 1)
5976     return 0;
5977 
5978   unsigned Cost = 0;
5979   Type *RetTy = ToVectorTy(I->getType(), VF);
5980   if (!RetTy->isVoidTy() &&
5981       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5982     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5983 
5984   // Some targets keep addresses scalar.
5985   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5986     return Cost;
5987 
5988   // Some targets support efficient element stores.
5989   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5990     return Cost;
5991 
5992   // Collect operands to consider.
5993   CallInst *CI = dyn_cast<CallInst>(I);
5994   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5995 
5996   // Skip operands that do not require extraction/scalarization and do not incur
5997   // any overhead.
5998   return Cost + TTI.getOperandsScalarizationOverhead(
5999                     filterExtractingOperands(Ops, VF), VF);
6000 }
6001 
6002 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6003   if (VF == 1)
6004     return;
6005   NumPredStores = 0;
6006   for (BasicBlock *BB : TheLoop->blocks()) {
6007     // For each instruction in the old loop.
6008     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6010       if (!Ptr)
6011         continue;
6012 
6013       // TODO: We should generate better code and update the cost model for
6014       // predicated uniform stores. Today they are treated as any other
6015       // predicated store (see added test cases in
6016       // invariant-store-vectorization.ll).
6017       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6018         NumPredStores++;
6019 
6020       if (Legal->isUniform(Ptr) &&
6021           // Conditional loads and stores should be scalarized and predicated.
6022           // isScalarWithPredication cannot be used here since masked
6023           // gather/scatters are not considered scalar with predication.
6024           !Legal->blockNeedsPredication(I.getParent())) {
6025         // TODO: Avoid replicating loads and stores instead of
6026         // relying on instcombine to remove them.
6027         // Load: Scalar load + broadcast
6028         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6029         unsigned Cost = getUniformMemOpCost(&I, VF);
6030         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6031         continue;
6032       }
6033 
6034       // We assume that widening is the best solution when possible.
6035       if (memoryInstructionCanBeWidened(&I, VF)) {
6036         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6037         int ConsecutiveStride =
6038                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6039         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6040                "Expected consecutive stride.");
6041         InstWidening Decision =
6042             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6043         setWideningDecision(&I, VF, Decision, Cost);
6044         continue;
6045       }
6046 
6047       // Choose between Interleaving, Gather/Scatter or Scalarization.
6048       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6049       unsigned NumAccesses = 1;
6050       if (isAccessInterleaved(&I)) {
6051         auto Group = getInterleavedAccessGroup(&I);
6052         assert(Group && "Fail to get an interleaved access group.");
6053 
6054         // Make one decision for the whole group.
6055         if (getWideningDecision(&I, VF) != CM_Unknown)
6056           continue;
6057 
6058         NumAccesses = Group->getNumMembers();
6059         if (interleavedAccessCanBeWidened(&I, VF))
6060           InterleaveCost = getInterleaveGroupCost(&I, VF);
6061       }
6062 
6063       unsigned GatherScatterCost =
6064           isLegalGatherOrScatter(&I)
6065               ? getGatherScatterCost(&I, VF) * NumAccesses
6066               : std::numeric_limits<unsigned>::max();
6067 
6068       unsigned ScalarizationCost =
6069           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6070 
6071       // Choose better solution for the current VF,
6072       // write down this decision and use it during vectorization.
6073       unsigned Cost;
6074       InstWidening Decision;
6075       if (InterleaveCost <= GatherScatterCost &&
6076           InterleaveCost < ScalarizationCost) {
6077         Decision = CM_Interleave;
6078         Cost = InterleaveCost;
6079       } else if (GatherScatterCost < ScalarizationCost) {
6080         Decision = CM_GatherScatter;
6081         Cost = GatherScatterCost;
6082       } else {
6083         Decision = CM_Scalarize;
6084         Cost = ScalarizationCost;
6085       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group also receives the cost,
      // but the cost will actually be assigned to one instruction.
6089       if (auto Group = getInterleavedAccessGroup(&I))
6090         setWideningDecision(Group, VF, Decision, Cost);
6091       else
6092         setWideningDecision(&I, VF, Decision, Cost);
6093     }
6094   }
6095 
  // Make sure that any load of an address, and any other address computation,
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
6101   if (TTI.prefersVectorizedAddressing())
6102     return;
6103 
6104   // Start with all scalar pointer uses.
6105   SmallPtrSet<Instruction *, 8> AddrDefs;
6106   for (BasicBlock *BB : TheLoop->blocks())
6107     for (Instruction &I : *BB) {
6108       Instruction *PtrDef =
6109         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6110       if (PtrDef && TheLoop->contains(PtrDef) &&
6111           getWideningDecision(&I, VF) != CM_GatherScatter)
6112         AddrDefs.insert(PtrDef);
6113     }
6114 
6115   // Add all instructions used to generate the addresses.
6116   SmallVector<Instruction *, 4> Worklist;
6117   for (auto *I : AddrDefs)
6118     Worklist.push_back(I);
6119   while (!Worklist.empty()) {
6120     Instruction *I = Worklist.pop_back_val();
6121     for (auto &Op : I->operands())
6122       if (auto *InstOp = dyn_cast<Instruction>(Op))
6123         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6124             AddrDefs.insert(InstOp).second)
6125           Worklist.push_back(InstOp);
6126   }
6127 
6128   for (auto *I : AddrDefs) {
6129     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out if the
      // loaded register is involved in an address computation, it is instead
      // changed here when we know this is the case.
6134       InstWidening Decision = getWideningDecision(I, VF);
6135       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6136         // Scalarize a widened load of address.
6137         setWideningDecision(I, VF, CM_Scalarize,
6138                             (VF * getMemoryInstructionCost(I, 1)));
6139       else if (auto Group = getInterleavedAccessGroup(I)) {
6140         // Scalarize an interleave group of address loads.
6141         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6142           if (Instruction *Member = Group->getMember(I))
6143             setWideningDecision(Member, VF, CM_Scalarize,
6144                                 (VF * getMemoryInstructionCost(Member, 1)));
6145         }
6146       }
6147     } else
6148       // Make sure I gets scalarized and a cost estimate without
6149       // scalarization overhead.
6150       ForcedScalars[VF].insert(I);
6151   }
6152 }
6153 
6154 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6155                                                         unsigned VF,
6156                                                         Type *&VectorTy) {
6157   Type *RetTy = I->getType();
6158   if (canTruncateToMinimalBitwidth(I, VF))
6159     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6160   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6161   auto SE = PSE.getSE();
6162 
6163   // TODO: We need to estimate the cost of intrinsic calls.
6164   switch (I->getOpcode()) {
6165   case Instruction::GetElementPtr:
6166     // We mark this instruction as zero-cost because the cost of GEPs in
6167     // vectorized code depends on whether the corresponding memory instruction
6168     // is scalarized or not. Therefore, we handle GEPs with the memory
6169     // instruction cost.
6170     return 0;
6171   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
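    // For example (sketch only): with VF == 4 there are four such predicated
    // blocks, each guarded by a branch that first extracts its i1 lane from
    // the vector compare; the cost computed below models those extracts plus
    // the VF branches.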
6175     bool ScalarPredicatedBB = false;
6176     BranchInst *BI = cast<BranchInst>(I);
6177     if (VF > 1 && BI->isConditional() &&
6178         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6179              PredicatedBBsAfterVectorization.end() ||
6180          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6181              PredicatedBBsAfterVectorization.end()))
6182       ScalarPredicatedBB = true;
6183 
6184     if (ScalarPredicatedBB) {
6185       // Return cost for branches around scalarized and predicated blocks.
6186       Type *Vec_i1Ty =
6187           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6188       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6189               (TTI.getCFInstrCost(Instruction::Br) * VF));
6190     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6191       // The back-edge branch will remain, as will all scalar branches.
6192       return TTI.getCFInstrCost(Instruction::Br);
6193     else
6194       // This branch will be eliminated by if-conversion.
6195       return 0;
6196     // Note: We currently assume zero cost for an unconditional branch inside
6197     // a predicated block since it will become a fall-through, although we
6198     // may decide in the future to call TTI for all branches.
6199   }
6200   case Instruction::PHI: {
6201     auto *Phi = cast<PHINode>(I);
6202 
6203     // First-order recurrences are replaced by vector shuffles inside the loop.
6204     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6205     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6206       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6207                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6208 
6209     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6210     // converted into select instructions. We require N - 1 selects per phi
6211     // node, where N is the number of incoming values.
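    // For example, a phi with three incoming values lowers to two chained
    // vector selects.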
6212     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6213       return (Phi->getNumIncomingValues() - 1) *
6214              TTI.getCmpSelInstrCost(
6215                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6216                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6217 
6218     return TTI.getCFInstrCost(Instruction::PHI);
6219   }
6220   case Instruction::UDiv:
6221   case Instruction::SDiv:
6222   case Instruction::URem:
6223   case Instruction::SRem:
6224     // If we have a predicated instruction, it may not be executed for each
6225     // vector lane. Get the scalarization cost and scale this amount by the
6226     // probability of executing the predicated block. If the instruction is not
6227     // predicated, we fall through to the next case.
6228     if (VF > 1 && isScalarWithPredication(I)) {
6229       unsigned Cost = 0;
6230 
6231       // These instructions have a non-void type, so account for the phi nodes
6232       // that we will create. This cost is likely to be zero. The phi node
6233       // cost, if any, should be scaled by the block probability because it
6234       // models a copy at the end of each predicated block.
6235       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6236 
6237       // The cost of the non-predicated instruction.
6238       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6239 
6240       // The cost of insertelement and extractelement instructions needed for
6241       // scalarization.
6242       Cost += getScalarizationOverhead(I, VF);
6243 
6244       // Scale the cost by the probability of executing the predicated blocks.
6245       // This assumes the predicated block for each vector lane is equally
6246       // likely.
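      // Illustrative arithmetic, assuming getReciprocalPredBlockProb() is 2
      // (i.e. the predicated block executes about half the time): with
      // VF == 4, the summed per-lane phi, divide and insert/extract costs
      // above are divided by 2 before being returned.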
6247       return Cost / getReciprocalPredBlockProb();
6248     }
6249     LLVM_FALLTHROUGH;
6250   case Instruction::Add:
6251   case Instruction::FAdd:
6252   case Instruction::Sub:
6253   case Instruction::FSub:
6254   case Instruction::Mul:
6255   case Instruction::FMul:
6256   case Instruction::FDiv:
6257   case Instruction::FRem:
6258   case Instruction::Shl:
6259   case Instruction::LShr:
6260   case Instruction::AShr:
6261   case Instruction::And:
6262   case Instruction::Or:
6263   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
6265     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6266       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6269     Value *Op2 = I->getOperand(1);
6270     TargetTransformInfo::OperandValueProperties Op2VP;
6271     TargetTransformInfo::OperandValueKind Op2VK =
6272         TTI.getOperandInfo(Op2, Op2VP);
6273     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6274       Op2VK = TargetTransformInfo::OK_UniformValue;
6275 
6276     SmallVector<const Value *, 4> Operands(I->operand_values());
6277     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6278     return N * TTI.getArithmeticInstrCost(
6279                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6280                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6281   }
6282   case Instruction::FNeg: {
6283     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6284     return N * TTI.getArithmeticInstrCost(
6285                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6286                    TargetTransformInfo::OK_AnyValue,
6287                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6288                    I->getOperand(0), I);
6289   }
6290   case Instruction::Select: {
6291     SelectInst *SI = cast<SelectInst>(I);
6292     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6293     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6294     Type *CondTy = SI->getCondition()->getType();
6295     if (!ScalarCond)
6296       CondTy = VectorType::get(CondTy, VF);
6297 
6298     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6299   }
6300   case Instruction::ICmp:
6301   case Instruction::FCmp: {
6302     Type *ValTy = I->getOperand(0)->getType();
6303     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6304     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6305       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6306     VectorTy = ToVectorTy(ValTy, VF);
6307     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6308   }
6309   case Instruction::Store:
6310   case Instruction::Load: {
6311     unsigned Width = VF;
6312     if (Width > 1) {
6313       InstWidening Decision = getWideningDecision(I, Width);
6314       assert(Decision != CM_Unknown &&
6315              "CM decision should be taken at this point");
6316       if (Decision == CM_Scalarize)
6317         Width = 1;
6318     }
6319     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6320     return getMemoryInstructionCost(I, VF);
6321   }
6322   case Instruction::ZExt:
6323   case Instruction::SExt:
6324   case Instruction::FPToUI:
6325   case Instruction::FPToSI:
6326   case Instruction::FPExt:
6327   case Instruction::PtrToInt:
6328   case Instruction::IntToPtr:
6329   case Instruction::SIToFP:
6330   case Instruction::UIToFP:
6331   case Instruction::Trunc:
6332   case Instruction::FPTrunc:
6333   case Instruction::BitCast: {
6334     // We optimize the truncation of induction variables having constant
6335     // integer steps. The cost of these truncations is the same as the scalar
6336     // operation.
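    // For example, "%t = trunc i64 %iv to i32", where %iv is an induction
    // with a constant integer step, can have its vector induction generated
    // directly at i32, so only the cost of the scalar truncate is charged
    // here.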
6337     if (isOptimizableIVTruncate(I, VF)) {
6338       auto *Trunc = cast<TruncInst>(I);
6339       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6340                                   Trunc->getSrcTy(), Trunc);
6341     }
6342 
6343     Type *SrcScalarTy = I->getOperand(0)->getType();
6344     Type *SrcVecTy =
6345         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6346     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6350       //
6351       // Calculate the modified src and dest types.
6352       Type *MinVecTy = VectorTy;
6353       if (I->getOpcode() == Instruction::Trunc) {
6354         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6355         VectorTy =
6356             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6357       } else if (I->getOpcode() == Instruction::ZExt ||
6358                  I->getOpcode() == Instruction::SExt) {
6359         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6360         VectorTy =
6361             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6362       }
6363     }
6364 
6365     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6366     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6367   }
6368   case Instruction::Call: {
6369     bool NeedToScalarize;
6370     CallInst *CI = cast<CallInst>(I);
6371     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6372     if (getVectorIntrinsicIDForCall(CI, TLI))
6373       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6374     return CallCost;
6375   }
6376   default:
6377     // The cost of executing VF copies of the scalar instruction. This opcode
6378     // is unknown. Assume that it is the same as 'mul'.
6379     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6380            getScalarizationOverhead(I, VF);
6381   } // end of switch.
6382 }
6383 
6384 char LoopVectorize::ID = 0;
6385 
6386 static const char lv_name[] = "Loop Vectorization";
6387 
6388 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6389 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6390 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6391 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6392 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6393 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6394 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6395 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6396 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6397 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6398 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6399 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6400 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6401 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6402 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6403 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6404 
6405 namespace llvm {
6406 
6407 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6408 
6409 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6410                               bool VectorizeOnlyWhenForced) {
6411   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6412 }
6413 
6414 } // end namespace llvm
6415 
6416 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6417   // Check if the pointer operand of a load or store instruction is
6418   // consecutive.
6419   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6420     return Legal->isConsecutivePtr(Ptr);
6421   return false;
6422 }
6423 
6424 void LoopVectorizationCostModel::collectValuesToIgnore() {
6425   // Ignore ephemeral values.
6426   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6427 
6428   // Ignore type-promoting instructions we identified during reduction
6429   // detection.
6430   for (auto &Reduction : *Legal->getReductionVars()) {
6431     RecurrenceDescriptor &RedDes = Reduction.second;
6432     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6433     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6434   }
6435   // Ignore type-casting instructions we identified during induction
6436   // detection.
6437   for (auto &Induction : *Legal->getInductionVars()) {
6438     InductionDescriptor &IndDes = Induction.second;
6439     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6440     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6441   }
6442 }
6443 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do this yet because VPlan currently
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
6449 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6450                                  LoopVectorizationCostModel &CM) {
6451   unsigned WidestType;
6452   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6453   return WidestVectorRegBits / WidestType;
6454 }
6455 
6456 VectorizationFactor
6457 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6458   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
6463   if (!OrigLoop->empty()) {
6464     // If the user doesn't provide a vectorization factor, determine a
6465     // reasonable one.
6466     if (!UserVF) {
6467       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6468       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6469 
6470       // Make sure we have a VF > 1 for stress testing.
6471       if (VPlanBuildStressTest && VF < 2) {
6472         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6473                           << "overriding computed VF.\n");
6474         VF = 4;
6475       }
6476     }
6477     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6478     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6479     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6480                       << " to build VPlans.\n");
6481     buildVPlans(VF, VF);
6482 
6483     // For VPlan build stress testing, we bail out after VPlan construction.
6484     if (VPlanBuildStressTest)
6485       return VectorizationFactor::Disabled();
6486 
6487     return {VF, 0};
6488   }
6489 
6490   LLVM_DEBUG(
6491       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6492                 "VPlan-native path.\n");
6493   return VectorizationFactor::Disabled();
6494 }
6495 
6496 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6497   assert(OrigLoop->empty() && "Inner loop expected.");
6498   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6500     return None;
6501 
6502   // Invalidate interleave groups if all blocks of loop will be predicated.
6503   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6504       !useMaskedInterleavedAccesses(*TTI)) {
6505     LLVM_DEBUG(
6506         dbgs()
6507         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6508            "which requires masked-interleaved support.\n");
6509     CM.InterleaveInfo.reset();
6510   }
6511 
6512   if (UserVF) {
6513     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6514     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6515     // Collect the instructions (and their associated costs) that will be more
6516     // profitable to scalarize.
6517     CM.selectUserVectorizationFactor(UserVF);
6518     buildVPlansWithVPRecipes(UserVF, UserVF);
6519     LLVM_DEBUG(printPlans(dbgs()));
6520     return {{UserVF, 0}};
6521   }
6522 
6523   unsigned MaxVF = MaybeMaxVF.getValue();
6524   assert(MaxVF != 0 && "MaxVF is zero.");
6525 
6526   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6527     // Collect Uniform and Scalar instructions after vectorization with VF.
6528     CM.collectUniformsAndScalars(VF);
6529 
6530     // Collect the instructions (and their associated costs) that will be more
6531     // profitable to scalarize.
6532     if (VF > 1)
6533       CM.collectInstsToScalarize(VF);
6534   }
6535 
6536   buildVPlansWithVPRecipes(1, MaxVF);
6537   LLVM_DEBUG(printPlans(dbgs()));
6538   if (MaxVF == 1)
6539     return VectorizationFactor::Disabled();
6540 
6541   // Select the optimal vectorization factor.
6542   return CM.selectVectorizationFactor(MaxVF);
6543 }
6544 
6545 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6546   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6547                     << '\n');
6548   BestVF = VF;
6549   BestUF = UF;
6550 
6551   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6552     return !Plan->hasVF(VF);
6553   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6555 }
6556 
6557 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6558                                            DominatorTree *DT) {
6559   // Perform the actual loop transformation.
6560 
6561   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6562   VPCallbackILV CallbackILV(ILV);
6563 
6564   VPTransformState State{BestVF, BestUF,      LI,
6565                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6566                          &ILV,   CallbackILV};
6567   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6568   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6569 
6570   //===------------------------------------------------===//
6571   //
  // Notice: any optimization or new instruction that goes
6573   // into the code below should also be implemented in
6574   // the cost-model.
6575   //
6576   //===------------------------------------------------===//
6577 
6578   // 2. Copy and widen instructions from the old loop into the new loop.
6579   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6580   VPlans.front()->execute(&State);
6581 
6582   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6583   //    predication, updating analyses.
6584   ILV.fixVectorizedLoop();
6585 }
6586 
6587 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6588     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6589   BasicBlock *Latch = OrigLoop->getLoopLatch();
6590 
6591   // We create new control-flow for the vectorized loop, so the original
6592   // condition will be dead after vectorization if it's only used by the
6593   // branch.
6594   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6595   if (Cmp && Cmp->hasOneUse())
6596     DeadInstructions.insert(Cmp);
6597 
6598   // We create new "steps" for induction variable updates to which the original
6599   // induction variables map. An original update instruction will be dead if
6600   // all its users except the induction variable are dead.
6601   for (auto &Induction : *Legal->getInductionVars()) {
6602     PHINode *Ind = Induction.first;
6603     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6604     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6605           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6606                                  DeadInstructions.end();
6607         }))
6608       DeadInstructions.insert(IndUpdate);
6609 
6610     // We record as "Dead" also the type-casting instructions we had identified
6611     // during induction analysis. We don't need any handling for them in the
6612     // vectorized loop because we have proven that, under a proper runtime
6613     // test guarding the vectorized loop, the value of the phi, and the casted
6614     // value of the phi, are the same. The last instruction in this casting chain
6615     // will get its scalar/vector/widened def from the scalar/vector/widened def
6616     // of the respective phi node. Any other casts in the induction def-use chain
6617     // have no other uses outside the phi update chain, and will be ignored.
6618     InductionDescriptor &IndDes = Induction.second;
6619     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6620     DeadInstructions.insert(Casts.begin(), Casts.end());
6621   }
6622 }
6623 
6624 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6625 
6626 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6627 
6628 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6629                                         Instruction::BinaryOps BinOp) {
6630   // When unrolling and the VF is 1, we only need to add a simple scalar.
6631   Type *Ty = Val->getType();
6632   assert(!Ty->isVectorTy() && "Val must be a scalar");
6633 
6634   if (Ty->isFloatingPointTy()) {
6635     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6636 
6637     // Floating point operations had to be 'fast' to enable the unrolling.
6638     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6639     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6640   }
6641   Constant *C = ConstantInt::get(Ty, StartIdx);
6642   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6643 }
6644 
6645 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6646   SmallVector<Metadata *, 4> MDs;
6647   // Reserve first location for self reference to the LoopID metadata node.
6648   MDs.push_back(nullptr);
6649   bool IsUnrollMetadata = false;
6650   MDNode *LoopID = L->getLoopID();
6651   if (LoopID) {
6652     // First find existing loop unrolling disable metadata.
6653     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6654       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6655       if (MD) {
6656         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6657         IsUnrollMetadata =
6658             S && S->getString().startswith("llvm.loop.unroll.disable");
6659       }
6660       MDs.push_back(LoopID->getOperand(i));
6661     }
6662   }
6663 
6664   if (!IsUnrollMetadata) {
6665     // Add runtime unroll disable metadata.
6666     LLVMContext &Context = L->getHeader()->getContext();
6667     SmallVector<Metadata *, 1> DisableOperands;
6668     DisableOperands.push_back(
6669         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6670     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6671     MDs.push_back(DisableNode);
6672     MDNode *NewLoopID = MDNode::get(Context, MDs);
6673     // Set operand 0 to refer to the loop id itself.
6674     NewLoopID->replaceOperandWith(0, NewLoopID);
6675     L->setLoopID(NewLoopID);
6676   }
6677 }
6678 
6679 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6680     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6681   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6682   bool PredicateAtRangeStart = Predicate(Range.Start);
6683 
6684   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6685     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6686       Range.End = TmpVF;
6687       break;
6688     }
6689 
6690   return PredicateAtRangeStart;
6691 }
6692 
6693 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6694 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6695 /// of VF's starting at a given VF and extending it as much as possible. Each
6696 /// vectorization decision can potentially shorten this sub-range during
6697 /// buildVPlan().
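///
/// For example (illustrative only), with \p MinVF = 1 and \p MaxVF = 8 this
/// might build one VPlan covering {1} and a second covering {2, 4, 8}, if some
/// decision first changes between VF = 1 and VF = 2.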
6698 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6699   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6700     VFRange SubRange = {VF, MaxVF + 1};
6701     VPlans.push_back(buildVPlan(SubRange));
6702     VF = SubRange.End;
6703   }
6704 }
6705 
6706 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6707                                          VPlanPtr &Plan) {
6708   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6709 
6710   // Look for cached value.
6711   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6712   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6713   if (ECEntryIt != EdgeMaskCache.end())
6714     return ECEntryIt->second;
6715 
6716   VPValue *SrcMask = createBlockInMask(Src, Plan);
6717 
6718   // The terminator has to be a branch inst!
6719   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6720   assert(BI && "Unexpected terminator found");
6721 
6722   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6723     return EdgeMaskCache[Edge] = SrcMask;
6724 
6725   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6726   assert(EdgeMask && "No Edge Mask found for condition");
6727 
6728   if (BI->getSuccessor(0) != Dst)
6729     EdgeMask = Builder.createNot(EdgeMask);
6730 
6731   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6732     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6733 
6734   return EdgeMaskCache[Edge] = EdgeMask;
6735 }
6736 
6737 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6738   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6739 
6740   // Look for cached value.
6741   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6742   if (BCEntryIt != BlockMaskCache.end())
6743     return BCEntryIt->second;
6744 
6745   // All-one mask is modelled as no-mask following the convention for masked
6746   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6747   VPValue *BlockMask = nullptr;
6748 
6749   if (OrigLoop->getHeader() == BB) {
6750     if (!CM.blockNeedsPredication(BB))
6751       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6752 
6753     // Introduce the early-exit compare IV <= BTC to form header block mask.
6754     // This is used instead of IV < TC because TC may wrap, unlike BTC.
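    // For example, with an i8 induction a trip count of 256 wraps to 0, while
    // the backedge-taken count of 255 is representable, so "IV <= 255" still
    // yields the correct header mask.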
6755     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6756     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6757     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6758     return BlockMaskCache[BB] = BlockMask;
6759   }
6760 
6761   // This is the block mask. We OR all incoming edges.
6762   for (auto *Predecessor : predecessors(BB)) {
6763     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6764     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6765       return BlockMaskCache[BB] = EdgeMask;
6766 
6767     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6768       BlockMask = EdgeMask;
6769       continue;
6770     }
6771 
6772     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6773   }
6774 
6775   return BlockMaskCache[BB] = BlockMask;
6776 }
6777 
6778 VPWidenMemoryInstructionRecipe *
6779 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6780                                   VPlanPtr &Plan) {
6781   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6782     return nullptr;
6783 
6784   auto willWiden = [&](unsigned VF) -> bool {
6785     if (VF == 1)
6786       return false;
6787     LoopVectorizationCostModel::InstWidening Decision =
6788         CM.getWideningDecision(I, VF);
6789     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6790            "CM decision should be taken at this point.");
6791     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6792       return true;
6793     if (CM.isScalarAfterVectorization(I, VF) ||
6794         CM.isProfitableToScalarize(I, VF))
6795       return false;
6796     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6797   };
6798 
6799   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6800     return nullptr;
6801 
6802   VPValue *Mask = nullptr;
6803   if (Legal->isMaskRequired(I))
6804     Mask = createBlockInMask(I->getParent(), Plan);
6805 
6806   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6807   return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
6808 }
6809 
6810 VPWidenIntOrFpInductionRecipe *
6811 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6812   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6813     // Check if this is an integer or fp induction. If so, build the recipe that
6814     // produces its scalar and vector values.
6815     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6816     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6817         II.getKind() == InductionDescriptor::IK_FpInduction)
6818       return new VPWidenIntOrFpInductionRecipe(Phi);
6819 
6820     return nullptr;
6821   }
6822 
6823   // Optimize the special case where the source is a constant integer
6824   // induction variable. Notice that we can only optimize the 'trunc' case
6825   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6826   // (c) other casts depend on pointer size.
6827 
6828   // Determine whether \p K is a truncation based on an induction variable that
6829   // can be optimized.
6830   auto isOptimizableIVTruncate =
6831       [&](Instruction *K) -> std::function<bool(unsigned)> {
6832     return
6833         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6834   };
6835 
6836   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6837                                isOptimizableIVTruncate(I), Range))
6838     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6839                                              cast<TruncInst>(I));
6840   return nullptr;
6841 }
6842 
6843 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6844   PHINode *Phi = dyn_cast<PHINode>(I);
6845   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6846     return nullptr;
6847 
6848   // We know that all PHIs in non-header blocks are converted into selects, so
6849   // we don't have to worry about the insertion order and we can just use the
6850   // builder. At this point we generate the predication tree. There may be
6851   // duplications since this is a simple recursive scan, but future
6852   // optimizations will clean it up.
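  // For example, a phi merging the "then" and "else" values of an if-converted
  // diamond collects one mask per incoming edge here; VPBlendRecipe::execute
  // later turns these into a chain of selects.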
6853 
6854   SmallVector<VPValue *, 2> Masks;
6855   unsigned NumIncoming = Phi->getNumIncomingValues();
6856   for (unsigned In = 0; In < NumIncoming; In++) {
6857     VPValue *EdgeMask =
6858       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6859     assert((EdgeMask || NumIncoming == 1) &&
6860            "Multiple predecessors with one having a full mask");
6861     if (EdgeMask)
6862       Masks.push_back(EdgeMask);
6863   }
6864   return new VPBlendRecipe(Phi, Masks);
6865 }
6866 
6867 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6868                                  VFRange &Range) {
6869 
6870   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6871       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6872 
6873   if (IsPredicated)
6874     return false;
6875 
6876   auto IsVectorizableOpcode = [](unsigned Opcode) {
6877     switch (Opcode) {
6878     case Instruction::Add:
6879     case Instruction::And:
6880     case Instruction::AShr:
6881     case Instruction::BitCast:
6882     case Instruction::Br:
6883     case Instruction::Call:
6884     case Instruction::FAdd:
6885     case Instruction::FCmp:
6886     case Instruction::FDiv:
6887     case Instruction::FMul:
6888     case Instruction::FNeg:
6889     case Instruction::FPExt:
6890     case Instruction::FPToSI:
6891     case Instruction::FPToUI:
6892     case Instruction::FPTrunc:
6893     case Instruction::FRem:
6894     case Instruction::FSub:
6895     case Instruction::ICmp:
6896     case Instruction::IntToPtr:
6897     case Instruction::Load:
6898     case Instruction::LShr:
6899     case Instruction::Mul:
6900     case Instruction::Or:
6901     case Instruction::PHI:
6902     case Instruction::PtrToInt:
6903     case Instruction::SDiv:
6904     case Instruction::Select:
6905     case Instruction::SExt:
6906     case Instruction::Shl:
6907     case Instruction::SIToFP:
6908     case Instruction::SRem:
6909     case Instruction::Store:
6910     case Instruction::Sub:
6911     case Instruction::Trunc:
6912     case Instruction::UDiv:
6913     case Instruction::UIToFP:
6914     case Instruction::URem:
6915     case Instruction::Xor:
6916     case Instruction::ZExt:
6917       return true;
6918     }
6919     return false;
6920   };
6921 
6922   if (!IsVectorizableOpcode(I->getOpcode()))
6923     return false;
6924 
6925   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6926     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6927     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6928                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6929       return false;
6930   }
6931 
6932   auto willWiden = [&](unsigned VF) -> bool {
6933     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6934                              CM.isProfitableToScalarize(I, VF)))
6935       return false;
6936     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6937       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag shows whether we use an intrinsic or an ordinary call for the
      // vectorized version of the instruction.
      // Is it beneficial to perform the intrinsic call compared to the lib
      // call?
6942       bool NeedToScalarize;
6943       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6944       bool UseVectorIntrinsic =
6945           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6946       return UseVectorIntrinsic || !NeedToScalarize;
6947     }
6948     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6949       assert(CM.getWideningDecision(I, VF) ==
6950                  LoopVectorizationCostModel::CM_Scalarize &&
6951              "Memory widening decisions should have been taken care by now");
6952       return false;
6953     }
6954     return true;
6955   };
6956 
6957   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6958     return false;
6959   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6960   // to avoid having to split recipes later.
6961   bool IsSingleton = Ingredient2Recipe.count(I);
6962 
6963   // Success: widen this instruction.
6964 
6965   // Use the default widening recipe. We optimize the common case where
6966   // consecutive instructions can be represented by a single recipe.
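  // For example, a straight-line run of widenable adds and multiplies from the
  // same basic block typically ends up in a single VPWidenRecipe spanning all
  // of them.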
6967   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6968       LastExtensibleRecipe->appendInstruction(I))
6969     return true;
6970 
6971   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6972   if (!IsSingleton)
6973     LastExtensibleRecipe = WidenRecipe;
6974   setRecipe(I, WidenRecipe);
6975   VPBB->appendRecipe(WidenRecipe);
6976   return true;
6977 }
6978 
6979 VPBasicBlock *VPRecipeBuilder::handleReplication(
6980     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6981     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6982     VPlanPtr &Plan) {
6983   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6984       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6985       Range);
6986 
6987   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6988       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6989 
6990   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6991   setRecipe(I, Recipe);
6992 
6993   // Find if I uses a predicated instruction. If so, it will use its scalar
6994   // value. Avoid hoisting the insert-element which packs the scalar value into
6995   // a vector value, as that happens iff all users use the vector value.
6996   for (auto &Op : I->operands())
6997     if (auto *PredInst = dyn_cast<Instruction>(Op))
6998       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6999         PredInst2Recipe[PredInst]->setAlsoPack(false);
7000 
7001   // Finalize the recipe for Instr, first if it is not predicated.
7002   if (!IsPredicated) {
7003     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7004     VPBB->appendRecipe(Recipe);
7005     return VPBB;
7006   }
7007   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7008   assert(VPBB->getSuccessors().empty() &&
7009          "VPBB has successors when handling predicated replication.");
7010   // Record predicated instructions for above packing optimizations.
7011   PredInst2Recipe[I] = Recipe;
7012   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7013   VPBlockUtils::insertBlockAfter(Region, VPBB);
7014   auto *RegSucc = new VPBasicBlock();
7015   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7016   return RegSucc;
7017 }
7018 
7019 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7020                                                       VPRecipeBase *PredRecipe,
7021                                                       VPlanPtr &Plan) {
7022   // Instructions marked for predication are replicated and placed under an
7023   // if-then construct to prevent side-effects.
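  // For example, a predicated load yields a region of the rough shape
  // pred.load.entry -> pred.load.if -> pred.load.continue, with the replicate
  // recipe placed in the ".if" block and an optional phi recipe in the
  // ".continue" block.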
7024 
7025   // Generate recipes to compute the block mask for this region.
7026   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7027 
7028   // Build the triangular if-then region.
7029   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7030   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7031   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7032   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7033   auto *PHIRecipe =
7034       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7035   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7036   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7037   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7038 
7039   // Note: first set Entry as region entry and then connect successors starting
7040   // from it in order, to propagate the "parent" of each VPBasicBlock.
7041   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7042   VPBlockUtils::connectBlocks(Pred, Exit);
7043 
7044   return Region;
7045 }
7046 
7047 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7048                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7049   VPRecipeBase *Recipe = nullptr;
7050 
7051   // First, check for specific widening recipes that deal with memory
7052   // operations, inductions and Phi nodes.
7053   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7054       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7055       (Recipe = tryToBlend(Instr, Plan)) ||
7056       (isa<PHINode>(Instr) &&
7057        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7058     setRecipe(Instr, Recipe);
7059     VPBB->appendRecipe(Recipe);
7060     return true;
7061   }
7062 
7063   // Handle GEP widening.
7064   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7065     auto Scalarize = [&](unsigned VF) {
7066       return CM.isScalarWithPredication(Instr, VF) ||
7067              CM.isScalarAfterVectorization(Instr, VF) ||
7068              CM.isProfitableToScalarize(Instr, VF);
7069     };
7070     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7071       return false;
7072     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7073     setRecipe(Instr, Recipe);
7074     VPBB->appendRecipe(Recipe);
7075     return true;
7076   }
7077 
7078   // Check if Instr is to be widened by a general VPWidenRecipe, after
7079   // having first checked for specific widening recipes.
7080   if (tryToWiden(Instr, VPBB, Range))
7081     return true;
7082 
7083   return false;
7084 }
7085 
7086 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7087                                                         unsigned MaxVF) {
7088   assert(OrigLoop->empty() && "Inner loop expected.");
7089 
7090   // Collect conditions feeding internal conditional branches; they need to be
7091   // represented in VPlan for it to model masking.
7092   SmallPtrSet<Value *, 1> NeedDef;
7093 
7094   auto *Latch = OrigLoop->getLoopLatch();
7095   for (BasicBlock *BB : OrigLoop->blocks()) {
7096     if (BB == Latch)
7097       continue;
7098     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7099     if (Branch && Branch->isConditional())
7100       NeedDef.insert(Branch->getCondition());
7101   }
7102 
7103   // If the tail is to be folded by masking, the primary induction variable
7104   // needs to be represented in VPlan for it to model early-exit masking.
7105   // Also, both the Phi and the live-out instruction of each reduction are
7106   // required in order to introduce a select between them in VPlan.
7107   if (CM.foldTailByMasking()) {
7108     NeedDef.insert(Legal->getPrimaryInduction());
7109     for (auto &Reduction : *Legal->getReductionVars()) {
7110       NeedDef.insert(Reduction.first);
7111       NeedDef.insert(Reduction.second.getLoopExitInstr());
7112     }
7113   }
7114 
7115   // Collect instructions from the original loop that will become trivially dead
7116   // in the vectorized loop. We don't need to vectorize these instructions. For
7117   // example, original induction update instructions can become dead because we
7118   // separately emit induction "steps" when generating code for the new loop.
7119   // Similarly, we create a new latch condition when setting up the structure
7120   // of the new loop, so the old one can become dead.
7121   SmallPtrSet<Instruction *, 4> DeadInstructions;
7122   collectTriviallyDeadInstructions(DeadInstructions);
7123 
7124   // Add assume instructions we need to drop to DeadInstructions, to prevent
7125   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7127   // control flow is preserved, we should keep them.
7128   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7129   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7130 
7131   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7132   // Dead instructions do not need sinking. Remove them from SinkAfter.
7133   for (Instruction *I : DeadInstructions)
7134     SinkAfter.erase(I);
7135 
7136   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7137     VFRange SubRange = {VF, MaxVF + 1};
7138     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7139                                              DeadInstructions, SinkAfter));
7140     VF = SubRange.End;
7141   }
7142 }
7143 
7144 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7145     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7146     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7147     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7148 
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7152   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7153 
7154   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7155 
7156   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7157 
7158   // ---------------------------------------------------------------------------
7159   // Pre-construction: record ingredients whose recipes we'll need to further
7160   // process after constructing the initial VPlan.
7161   // ---------------------------------------------------------------------------
7162 
7163   // Mark instructions we'll need to sink later and their targets as
7164   // ingredients whose recipe we'll need to record.
7165   for (auto &Entry : SinkAfter) {
7166     RecipeBuilder.recordRecipeOf(Entry.first);
7167     RecipeBuilder.recordRecipeOf(Entry.second);
7168   }
7169 
7170   // For each interleave group which is relevant for this (possibly trimmed)
7171   // Range, add it to the set of groups to be later applied to the VPlan and add
7172   // placeholders for its members' Recipes which we'll be replacing with a
7173   // single VPInterleaveRecipe.
7174   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7175     auto applyIG = [IG, this](unsigned VF) -> bool {
7176       return (VF >= 2 && // Query is illegal for VF == 1
7177               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7178                   LoopVectorizationCostModel::CM_Interleave);
7179     };
7180     if (!getDecisionAndClampRange(applyIG, Range))
7181       continue;
7182     InterleaveGroups.insert(IG);
7183     for (unsigned i = 0; i < IG->getFactor(); i++)
7184       if (Instruction *Member = IG->getMember(i))
7185         RecipeBuilder.recordRecipeOf(Member);
7186   };
7187 
7188   // ---------------------------------------------------------------------------
7189   // Build initial VPlan: Scan the body of the loop in a topological order to
7190   // visit each basic block after having visited its predecessor basic blocks.
7191   // ---------------------------------------------------------------------------
7192 
7193   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7194   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7195   auto Plan = std::make_unique<VPlan>(VPBB);
7196 
7197   // Represent values that will have defs inside VPlan.
7198   for (Value *V : NeedDef)
7199     Plan->addVPValue(V);
7200 
7201   // Scan the body of the loop in a topological order to visit each basic block
7202   // after having visited its predecessor basic blocks.
7203   LoopBlocksDFS DFS(OrigLoop);
7204   DFS.perform(LI);
7205 
7206   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7207     // Relevant instructions from basic block BB will be grouped into VPRecipe
7208     // ingredients and fill a new VPBasicBlock.
7209     unsigned VPBBsForBB = 0;
7210     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7211     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7212     VPBB = FirstVPBBForBB;
7213     Builder.setInsertPoint(VPBB);
7214 
7215     // Introduce each ingredient into VPlan.
7216     for (Instruction &I : BB->instructionsWithoutDebug()) {
7217       Instruction *Instr = &I;
7218 
7219       // First filter out irrelevant instructions, to ensure no recipes are
7220       // built for them.
7221       if (isa<BranchInst>(Instr) ||
7222           DeadInstructions.find(Instr) != DeadInstructions.end())
7223         continue;
7224 
7225       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7226         continue;
7227 
7228       // Otherwise, if all widening options failed, Instruction is to be
7229       // replicated. This may create a successor for VPBB.
7230       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7231           Instr, Range, VPBB, PredInst2Recipe, Plan);
7232       if (NextVPBB != VPBB) {
7233         VPBB = NextVPBB;
7234         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7235                                     : "");
7236       }
7237     }
7238   }
7239 
  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last VPBB, reflecting original basic blocks
  // with no recipes.
7243   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7244   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7245   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7246   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7247   delete PreEntry;
7248 
7249   // ---------------------------------------------------------------------------
7250   // Transform initial VPlan: Apply previously taken decisions, in order, to
7251   // bring the VPlan to its final state.
7252   // ---------------------------------------------------------------------------
7253 
7254   // Apply Sink-After legal constraints.
7255   for (auto &Entry : SinkAfter) {
7256     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7257     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7258     Sink->moveAfter(Target);
7259   }
7260 
7261   // Interleave memory: for each Interleave Group we marked earlier as relevant
7262   // for this VPlan, replace the Recipes widening its memory instructions with a
7263   // single VPInterleaveRecipe at its insertion point.
7264   for (auto IG : InterleaveGroups) {
7265     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7266         RecipeBuilder.getRecipe(IG->getInsertPos()));
7267     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7268         ->insertBefore(Recipe);
7269 
7270     for (unsigned i = 0; i < IG->getFactor(); ++i)
7271       if (Instruction *Member = IG->getMember(i)) {
7272         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7273       }
7274   }
7275 
7276   // Finally, if tail is folded by masking, introduce selects between the phi
7277   // and the live-out instruction of each reduction, at the end of the latch.
7278   if (CM.foldTailByMasking()) {
7279     Builder.setInsertPoint(VPBB);
7280     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7281     for (auto &Reduction : *Legal->getReductionVars()) {
7282       VPValue *Phi = Plan->getVPValue(Reduction.first);
7283       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7284       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7285     }
7286   }
7287 
7288   std::string PlanName;
7289   raw_string_ostream RSO(PlanName);
7290   unsigned VF = Range.Start;
7291   Plan->addVF(VF);
7292   RSO << "Initial VPlan for VF={" << VF;
7293   for (VF *= 2; VF < Range.End; VF *= 2) {
7294     Plan->addVF(VF);
7295     RSO << "," << VF;
7296   }
7297   RSO << "},UF>=1";
7298   RSO.flush();
7299   Plan->setName(PlanName);
7300 
7301   return Plan;
7302 }
7303 
7304 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
7309   assert(!OrigLoop->empty());
7310   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7311 
7312   // Create new empty VPlan
7313   auto Plan = std::make_unique<VPlan>();
7314 
7315   // Build hierarchical CFG
7316   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7317   HCFGBuilder.buildHierarchicalCFG();
7318 
7319   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7320     Plan->addVF(VF);
7321 
7322   if (EnableVPlanPredication) {
7323     VPlanPredicator VPP(*Plan);
7324     VPP.predicate();
7325 
7326     // Avoid running transformation to recipes until masked code generation in
7327     // VPlan-native path is in place.
7328     return Plan;
7329   }
7330 
7331   SmallPtrSet<Instruction *, 1> DeadInstructions;
7332   VPlanTransforms::VPInstructionsToVPRecipes(
7333       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7334   return Plan;
7335 }
7336 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7341 
7342 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7343     Value *V, const VPIteration &Instance) {
7344   return ILV.getOrCreateScalarValue(V, Instance);
7345 }
7346 
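// Prints the recipe as a fragment of a dot-graph node label, roughly of the
// form (illustrative)
//   "INTERLEAVE-GROUP with factor 2 at %insert.pos, <addr>[, <mask>]\l"
// followed by one line per member instruction and its index within the group.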
7347 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7348   O << " +\n"
7349     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7350   IG->getInsertPos()->printAsOperand(O, false);
7351   O << ", ";
7352   getAddr()->printAsOperand(O);
7353   VPValue *Mask = getMask();
7354   if (Mask) {
7355     O << ", ";
7356     Mask->printAsOperand(O);
7357   }
7358   O << "\\l\"";
7359   for (unsigned i = 0; i < IG->getFactor(); ++i)
7360     if (Instruction *I = IG->getMember(i))
7361       O << " +\n"
7362         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7363 }
7364 
7365 void VPWidenRecipe::execute(VPTransformState &State) {
7366   for (auto &Instr : make_range(Begin, End))
7367     State.ILV->widenInstruction(Instr);
7368 }
7369 
7370 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7371   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7372                       IsIndexLoopInvariant);
7373 }
7374 
7375 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7376   assert(!State.Instance && "Int or FP induction being replicated.");
7377   State.ILV->widenIntOrFpInduction(IV, Trunc);
7378 }
7379 
7380 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7381   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7382 }
7383 
7384 void VPBlendRecipe::execute(VPTransformState &State) {
7385   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7386   // We know that all PHIs in non-header blocks are converted into
7387   // selects, so we don't have to worry about the insertion order and we
7388   // can just use the builder.
7389   // At this point we generate the predication tree. There may be
7390   // duplications since this is a simple recursive scan, but future
7391   // optimizations will clean it up.
7392 
7393   unsigned NumIncoming = Phi->getNumIncomingValues();
7394 
7395   assert((User || NumIncoming == 1) &&
7396          "Multiple predecessors with predecessors having a full mask");
7397   // Generate a sequence of selects of the form:
7398   // SELECT(Mask3, In3,
7399   //      SELECT(Mask2, In2,
7400   //                   ( ...)))
7401   InnerLoopVectorizer::VectorParts Entry(State.UF);
7402   for (unsigned In = 0; In < NumIncoming; ++In) {
7403     for (unsigned Part = 0; Part < State.UF; ++Part) {
7404       // We might have single edge PHIs (blocks) - use an identity
7405       // 'select' for the first PHI operand.
7406       Value *In0 =
7407           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7408       if (In == 0)
7409         Entry[Part] = In0; // Initialize with the first incoming value.
7410       else {
7411         // Select between the current value and the previous incoming edge
7412         // based on the incoming mask.
7413         Value *Cond = State.get(User->getOperand(In), Part);
7414         Entry[Part] =
7415             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7416       }
7417     }
7418   }
7419   for (unsigned Part = 0; Part < State.UF; ++Part)
7420     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7421 }
7422 
7423 void VPInterleaveRecipe::execute(VPTransformState &State) {
7424   assert(!State.Instance && "Interleave group being replicated.");
7425   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7426                                       getMask());
7427 }
7428 
7429 void VPReplicateRecipe::execute(VPTransformState &State) {
7430   if (State.Instance) { // Generate a single instance.
7431     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7432     // Insert scalar instance packing it into a vector.
7433     if (AlsoPack && State.VF > 1) {
7434       // If we're constructing lane 0, initialize to start from undef.
7435       if (State.Instance->Lane == 0) {
7436         Value *Undef =
7437             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7438         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7439       }
7440       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7441     }
7442     return;
7443   }
7444 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
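  // For example, with UF = 2 and VF = 4 this creates 8 scalar copies of the
  // ingredient (one per lane of each part), but only 2 copies (lane 0 of each
  // part) if the ingredient is uniform.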
7448   unsigned EndLane = IsUniform ? 1 : State.VF;
7449   for (unsigned Part = 0; Part < State.UF; ++Part)
7450     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7451       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7452 }
7453 
7454 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7455   assert(State.Instance && "Branch on Mask works only on single instance.");
7456 
7457   unsigned Part = State.Instance->Part;
7458   unsigned Lane = State.Instance->Lane;
7459 
7460   Value *ConditionBit = nullptr;
7461   if (!User) // Block in mask is all-one.
7462     ConditionBit = State.Builder.getTrue();
7463   else {
7464     VPValue *BlockInMask = User->getOperand(0);
7465     ConditionBit = State.get(BlockInMask, Part);
7466     if (ConditionBit->getType()->isVectorTy())
7467       ConditionBit = State.Builder.CreateExtractElement(
7468           ConditionBit, State.Builder.getInt32(Lane));
7469   }
7470 
7471   // Replace the temporary unreachable terminator with a new conditional branch,
7472   // whose two destinations will be set later when they are created.
7473   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7474   assert(isa<UnreachableInst>(CurrentTerminator) &&
7475          "Expected to replace unreachable terminator with conditional branch.");
7476   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7477   CondBr->setSuccessor(0, nullptr);
7478   ReplaceInstWithInst(CurrentTerminator, CondBr);
7479 }
7480 
7481 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7482   assert(State.Instance && "Predicated instruction PHI works per instance.");
7483   Instruction *ScalarPredInst = cast<Instruction>(
7484       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7485   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7486   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7487   assert(PredicatingBB && "Predicated block has no single predecessor.");
7488 
7489   // By current pack/unpack logic we need to generate only a single phi node: if
7490   // a vector value for the predicated instruction exists at this point it means
7491   // the instruction has vector users only, and a phi for the vector value is
7492   // needed. In this case the recipe of the predicated instruction is marked to
7493   // also do that packing, thereby "hoisting" the insert-element sequence.
7494   // Otherwise, a phi node for the scalar value is needed.
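  // Illustrative sketch of the vector-value case (names are hypothetical):
  //   pred.if:                                   ; PredicatedBB
  //     %v.ins = insertelement <4 x i32> %v, i32 %s, i32 <lane>
  //   pred.continue:                             ; built here
  //     %v.phi = phi <4 x i32> [ %v, %pred.check ], [ %v.ins, %pred.if ]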
7495   unsigned Part = State.Instance->Part;
7496   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7497     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7498     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7499     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7500     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7501     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7502     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7503   } else {
7504     Type *PredInstType = PredInst->getType();
7505     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7506     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7507     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7508     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7509   }
7510 }
7511 
7512 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7513   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
7514 }
7515 
7516 // Determine how to lower the scalar epilogue, which depends on 1) optimising
7517 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7518 // predication, and 4) a TTI hook that analyses whether the loop is suitable
7519 // for predication.
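// For example (illustrative): when optimizing for size and vectorization is
// not explicitly forced, the result is CM_ScalarEpilogueNotAllowedOptSize;
// when predication is preferred (via the PreferPredicateOverEpilog option, a
// predicate loop hint, or the TTI hook) and a primary induction exists, the
// result is CM_ScalarEpilogueNotNeededUsePredicate, i.e. the tail is folded
// by masking.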
7520 static ScalarEpilogueLowering getScalarEpilogueLowering(
7521     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7522     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7523     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7524     LoopVectorizationLegality &LVL) {
7525   bool OptSize =
7526       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7527                                                      PGSOQueryType::IRPass);
7528   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7529   // don't look at hints or options, and don't request a scalar epilogue.
7530   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7531     return CM_ScalarEpilogueNotAllowedOptSize;
7532 
7533   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7534                               !PreferPredicateOverEpilog;
7535 
7536   // 2) Next, if disabling predication is requested on the command line, honour
7537   // this and request a scalar epilogue. Also do this if we don't have a
7538   // primary induction variable, which is required for predication.
7539   if (PredicateOptDisabled || !LVL.getPrimaryInduction())
7540     return CM_ScalarEpilogueAllowed;
7541 
  // 3) and 4) Look if enabling predication is requested on the command line,
  // with a loop hint, or if the TTI hook indicates this is profitable; if so,
  // request predication.
7545   if (PreferPredicateOverEpilog ||
7546       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7547       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7548                                         LVL.getLAI()) &&
7549        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7550     return CM_ScalarEpilogueNotNeededUsePredicate;
7551 
7552   return CM_ScalarEpilogueAllowed;
7553 }
7554 
7555 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
7557 // VPlan-to-VPlan transformations from the very beginning without modifying the
7558 // input LLVM IR.
7559 static bool processLoopInVPlanNativePath(
7560     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7561     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7562     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7563     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7564     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7565 
7566   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7567   Function *F = L->getHeader()->getParent();
7568   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7569 
7570   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7571       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7572 
7573   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7574                                 &Hints, IAI);
7575   // Use the planner for outer loop vectorization.
7576   // TODO: CM is not used at this point inside the planner. Turn CM into an
7577   // optional argument if we don't need it in the future.
7578   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7579 
7580   // Get user vectorization factor.
7581   const unsigned UserVF = Hints.getWidth();
7582 
7583   // Plan how to best vectorize, return the best VF and its cost.
7584   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7585 
7586   // If we are stress testing VPlan builds, do not attempt to generate vector
7587   // code. Masked vector code generation support will follow soon.
7588   // Also, do not attempt to vectorize if no vector code will be produced.
7589   if (VPlanBuildStressTest || EnableVPlanPredication ||
7590       VectorizationFactor::Disabled() == VF)
7591     return false;
7592 
7593   LVP.setBestPlan(VF.Width, 1);
7594 
7595   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7596                          &CM);
7597   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7598                     << L->getHeader()->getParent()->getName() << "\"\n");
7599   LVP.executePlan(LB, DT);
7600 
7601   // Mark the loop as already vectorized to avoid vectorizing again.
7602   Hints.setAlreadyVectorized();
7603 
7604   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7605   return true;
7606 }
7607 
7608 bool LoopVectorizePass::processLoop(Loop *L) {
7609   assert((EnableVPlanNativePath || L->empty()) &&
7610          "VPlan-native path is not enabled. Only process inner loops.");
7611 
7612 #ifndef NDEBUG
7613   const std::string DebugLocStr = getDebugLocString(L);
7614 #endif /* NDEBUG */
7615 
7616   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7617                     << L->getHeader()->getParent()->getName() << "\" from "
7618                     << DebugLocStr << "\n");
7619 
7620   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7621 
7622   LLVM_DEBUG(
7623       dbgs() << "LV: Loop hints:"
7624              << " force="
7625              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7626                      ? "disabled"
7627                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7628                             ? "enabled"
7629                             : "?"))
7630              << " width=" << Hints.getWidth()
7631              << " unroll=" << Hints.getInterleave() << "\n");
7632 
7633   // Function containing loop
7634   Function *F = L->getHeader()->getParent();
7635 
7636   // Looking at the diagnostic output is the only way to determine if a loop
7637   // was vectorized (other than looking at the IR or machine code), so it
7638   // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
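  // For example, a successfully vectorized loop results in an
  // OptimizationRemark that is printed roughly as (illustrative):
  //   remark: foo.c:4:3: vectorized loop (vectorization width: 4,
  //           interleaved count: 2)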
7643 
7644   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7645     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7646     return false;
7647   }
7648 
7649   PredicatedScalarEvolution PSE(*SE, *L);
7650 
7651   // Check if it is legal to vectorize the loop.
7652   LoopVectorizationRequirements Requirements(*ORE);
7653   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7654                                 &Requirements, &Hints, DB, AC);
7655   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7656     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7657     Hints.emitRemarkWithHints();
7658     return false;
7659   }
7660 
7661   // Check the function attributes and profiles to find out if this function
7662   // should be optimized for size.
7663   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7664       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7665 
7666   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7667   // here. They may require CFG and instruction level transformations before
7668   // even evaluating whether vectorization is profitable. Since we cannot modify
7669   // the incoming IR, we need to build VPlan upfront in the vectorization
7670   // pipeline.
7671   if (!L->empty())
7672     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7673                                         ORE, BFI, PSI, Hints);
7674 
7675   assert(L->empty() && "Inner loop expected.");
7676 
7677   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7678   // count by optimizing for size, to minimize overheads.
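  // For example, a loop whose best-known trip count is below the threshold is
  // still considered, but the scalar epilogue is disallowed (see the SEL
  // adjustment below) unless vectorization was explicitly forced.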
7679   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7680   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7681     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7682                       << "This loop is worth vectorizing only if no scalar "
7683                       << "iteration overheads are incurred.");
7684     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7685       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7686     else {
7687       LLVM_DEBUG(dbgs() << "\n");
7688       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7689     }
7690   }
7691 
7692   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem like it can be correct -- what if the loop
  // is an integer loop and the vector instructions selected are purely
  // integer vector instructions?
7696   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7697     reportVectorizationFailure(
7698         "Can't vectorize when the NoImplicitFloat attribute is used",
7699         "loop not vectorized due to NoImplicitFloat attribute",
7700         "NoImplicitFloat", ORE, L);
7701     Hints.emitRemarkWithHints();
7702     return false;
7703   }
7704 
7705   // Check if the target supports potentially unsafe FP vectorization.
7706   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7707   // for the target we're vectorizing for, to make sure none of the
7708   // additional fp-math flags can help.
7709   if (Hints.isPotentiallyUnsafe() &&
7710       TTI->isFPVectorizationPotentiallyUnsafe()) {
7711     reportVectorizationFailure(
7712         "Potentially unsafe FP op prevents vectorization",
7713         "loop not vectorized due to unsafe FP support.",
7714         "UnsafeFP", ORE, L);
7715     Hints.emitRemarkWithHints();
7716     return false;
7717   }
7718 
7719   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7720   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7721 
7722   // If an override option has been passed in for interleaved accesses, use it.
7723   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7724     UseInterleaved = EnableInterleavedMemAccesses;
7725 
7726   // Analyze interleaved memory accesses.
7727   if (UseInterleaved) {
7728     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7729   }
7730 
7731   // Use the cost model.
7732   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7733                                 F, &Hints, IAI);
7734   CM.collectValuesToIgnore();
7735 
7736   // Use the planner for vectorization.
7737   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7738 
7739   // Get user vectorization factor.
7740   unsigned UserVF = Hints.getWidth();
7741 
7742   // Plan how to best vectorize, return the best VF and its cost.
7743   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7744 
7745   VectorizationFactor VF = VectorizationFactor::Disabled();
7746   unsigned IC = 1;
7747   unsigned UserIC = Hints.getInterleave();
7748 
7749   if (MaybeVF) {
7750     VF = *MaybeVF;
7751     // Select the interleave count.
7752     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7753   }
7754 
7755   // Identify the diagnostic messages that should be produced.
7756   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7757   bool VectorizeLoop = true, InterleaveLoop = true;
7758   if (Requirements.doesNotMeet(F, L, Hints)) {
7759     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7760                          "requirements.\n");
7761     Hints.emitRemarkWithHints();
7762     return false;
7763   }
7764 
7765   if (VF.Width == 1) {
7766     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7767     VecDiagMsg = std::make_pair(
7768         "VectorizationNotBeneficial",
7769         "the cost-model indicates that vectorization is not beneficial");
7770     VectorizeLoop = false;
7771   }
7772 
7773   if (!MaybeVF && UserIC > 1) {
7774     // Tell the user interleaving was avoided up-front, despite being explicitly
7775     // requested.
7776     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7777                          "interleaving should be avoided up front\n");
7778     IntDiagMsg = std::make_pair(
7779         "InterleavingAvoided",
7780         "Ignoring UserIC, because interleaving was avoided up front");
7781     InterleaveLoop = false;
7782   } else if (IC == 1 && UserIC <= 1) {
7783     // Tell the user interleaving is not beneficial.
7784     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7785     IntDiagMsg = std::make_pair(
7786         "InterleavingNotBeneficial",
7787         "the cost-model indicates that interleaving is not beneficial");
7788     InterleaveLoop = false;
7789     if (UserIC == 1) {
7790       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7791       IntDiagMsg.second +=
7792           " and is explicitly disabled or interleave count is set to 1";
7793     }
7794   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly
    // disabled.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
                         "disabled.\n");
7798     IntDiagMsg = std::make_pair(
7799         "InterleavingBeneficialButDisabled",
7800         "the cost-model indicates that interleaving is beneficial "
7801         "but is explicitly disabled or interleave count is set to 1");
7802     InterleaveLoop = false;
7803   }
7804 
7805   // Override IC if user provided an interleave count.
7806   IC = UserIC > 0 ? UserIC : IC;
7807 
7808   // Emit diagnostic messages, if any.
7809   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7810   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7812     ORE->emit([&]() {
7813       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7814                                       L->getStartLoc(), L->getHeader())
7815              << VecDiagMsg.second;
7816     });
7817     ORE->emit([&]() {
7818       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7819                                       L->getStartLoc(), L->getHeader())
7820              << IntDiagMsg.second;
7821     });
7822     return false;
7823   } else if (!VectorizeLoop && InterleaveLoop) {
7824     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7825     ORE->emit([&]() {
7826       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7827                                         L->getStartLoc(), L->getHeader())
7828              << VecDiagMsg.second;
7829     });
7830   } else if (VectorizeLoop && !InterleaveLoop) {
7831     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7832                       << ") in " << DebugLocStr << '\n');
7833     ORE->emit([&]() {
7834       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7835                                         L->getStartLoc(), L->getHeader())
7836              << IntDiagMsg.second;
7837     });
7838   } else if (VectorizeLoop && InterleaveLoop) {
7839     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7840                       << ") in " << DebugLocStr << '\n');
7841     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7842   }
7843 
7844   LVP.setBestPlan(VF.Width, IC);
7845 
7846   using namespace ore;
7847   bool DisableRuntimeUnroll = false;
7848   MDNode *OrigLoopID = L->getLoopID();
7849 
7850   if (!VectorizeLoop) {
7851     assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided not to vectorize the loop (e.g. because the cost model
    // found vectorization not beneficial), then interleave it.
7854     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7855                                &CM);
7856     LVP.executePlan(Unroller, DT);
7857 
7858     ORE->emit([&]() {
7859       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7860                                 L->getHeader())
7861              << "interleaved loop (interleaved count: "
7862              << NV("InterleaveCount", IC) << ")";
7863     });
7864   } else {
    // If we decided to vectorize the loop, then do it.
7866     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7867                            &LVL, &CM);
7868     LVP.executePlan(LB, DT);
7869     ++LoopsVectorized;
7870 
7871     // Add metadata to disable runtime unrolling a scalar loop when there are
7872     // no runtime checks about strides and memory. A scalar loop that is
7873     // rarely used is not worth unrolling.
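    // Concretely, AddRuntimeUnrollDisableMetaData (called below) attaches the
    // "llvm.loop.unroll.runtime.disable" attribute to the remainder loop's
    // llvm.loop metadata.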
7874     if (!LB.areSafetyChecksAdded())
7875       DisableRuntimeUnroll = true;
7876 
7877     // Report the vectorization decision.
7878     ORE->emit([&]() {
7879       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7880                                 L->getHeader())
7881              << "vectorized loop (vectorization width: "
7882              << NV("VectorizationFactor", VF.Width)
7883              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7884     });
7885   }
7886 
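  // If the original loop ID provides followup metadata for the remainder loop
  // (LLVMLoopVectorizeFollowupEpilogue or LLVMLoopVectorizeFollowupAll),
  // propagate it; otherwise mark the loop as already vectorized and, when
  // applicable, disable runtime unrolling of the scalar remainder.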
7887   Optional<MDNode *> RemainderLoopID =
7888       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7889                                       LLVMLoopVectorizeFollowupEpilogue});
7890   if (RemainderLoopID.hasValue()) {
7891     L->setLoopID(RemainderLoopID.getValue());
7892   } else {
7893     if (DisableRuntimeUnroll)
7894       AddRuntimeUnrollDisableMetaData(L);
7895 
7896     // Mark the loop as already vectorized to avoid vectorizing again.
7897     Hints.setAlreadyVectorized();
7898   }
7899 
7900   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7901   return true;
7902 }
7903 
7904 bool LoopVectorizePass::runImpl(
7905     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7906     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7907     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7908     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7909     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7910   SE = &SE_;
7911   LI = &LI_;
7912   TTI = &TTI_;
7913   DT = &DT_;
7914   BFI = &BFI_;
7915   TLI = TLI_;
7916   AA = &AA_;
7917   AC = &AC_;
7918   GetLAA = &GetLAA_;
7919   DB = &DB_;
7920   ORE = &ORE_;
7921   PSI = PSI_;
7922 
7923   // Don't attempt if
7924   // 1. the target claims to have no vector registers, and
7925   // 2. interleaving won't help ILP.
7926   //
7927   // The second condition is necessary because, even if the target has no
7928   // vector registers, loop vectorization may still enable scalar
7929   // interleaving.
7930   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7931       TTI->getMaxInterleaveFactor(1) < 2)
7932     return false;
7933 
7934   bool Changed = false;
7935 
7936   // The vectorizer requires loops to be in simplified form.
7937   // Since simplification may add new inner loops, it has to run before the
7938   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7940   // vectorized.
7941   for (auto &L : *LI)
7942     Changed |=
7943         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7944 
7945   // Build up a worklist of inner-loops to vectorize. This is necessary as
7946   // the act of vectorizing or partially unrolling a loop creates new loops
7947   // and can invalidate iterators across the loops.
7948   SmallVector<Loop *, 8> Worklist;
7949 
7950   for (Loop *L : *LI)
7951     collectSupportedLoops(*L, LI, ORE, Worklist);
7952 
7953   LoopsAnalyzed += Worklist.size();
7954 
7955   // Now walk the identified inner loops.
7956   while (!Worklist.empty()) {
7957     Loop *L = Worklist.pop_back_val();
7958 
7959     // For the inner loops we actually process, form LCSSA to simplify the
7960     // transform.
7961     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7962 
7963     Changed |= processLoop(L);
7964   }
7965 
7966   // Process each loop nest in the function.
7967   return Changed;
7968 }
7969 
7970 PreservedAnalyses LoopVectorizePass::run(Function &F,
7971                                          FunctionAnalysisManager &AM) {
7972     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7973     auto &LI = AM.getResult<LoopAnalysis>(F);
7974     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7975     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7976     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7977     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7978     auto &AA = AM.getResult<AAManager>(F);
7979     auto &AC = AM.getResult<AssumptionAnalysis>(F);
7980     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7981     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7982     MemorySSA *MSSA = EnableMSSALoopDependency
7983                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7984                           : nullptr;
7985 
7986     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7987     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7988         [&](Loop &L) -> const LoopAccessInfo & {
7989       LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7990       return LAM.getResult<LoopAccessAnalysis>(L, AR);
7991     };
7992     const ModuleAnalysisManager &MAM =
7993         AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7994     ProfileSummaryInfo *PSI =
7995         MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7996     bool Changed =
7997         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7998     if (!Changed)
7999       return PreservedAnalyses::all();
8000     PreservedAnalyses PA;
8001 
8002     // We currently do not preserve loopinfo/dominator analyses with outer loop
8003     // vectorization. Until this is addressed, mark these analyses as preserved
8004     // only for non-VPlan-native path.
8005     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
8006     if (!EnableVPlanNativePath) {
8007       PA.preserve<LoopAnalysis>();
8008       PA.preserve<DominatorTreeAnalysis>();
8009     }
8010     PA.preserve<BasicAA>();
8011     PA.preserve<GlobalsAA>();
8012     return PA;
8013 }
8014