1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Indicates that an epilogue is undesired, predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
192              "will be determined by the smallest type in loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with a estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
281 // VPlan-native vectorization path. It must be used in conjuction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function that returns the type of loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = VectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors, or multiple
379 /// scalars. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
388 /// LoopVectorizationLegality class to provide information about the induction
389 /// and reduction variables that were found to a given vectorization factor.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM)
399       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
400         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
401         Builder(PSE.getSE()->getContext()),
402         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
403   virtual ~InnerLoopVectorizer() = default;
404 
405   /// Create a new empty loop. Unlink the old loop and connect the new one.
406   /// Return the pre-header block of the new loop.
407   BasicBlock *createVectorizedLoopSkeleton();
408 
409   /// Widen a single instruction within the innermost loop.
410   void widenInstruction(Instruction &I);
411 
412   /// Widen a single call instruction within the innermost loop.
413   void widenCallInstruction(CallInst &I);
414 
415   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
416   void fixVectorizedLoop();
417 
418   // Return true if any runtime check is added.
419   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
420 
421   /// A type for vectorized values in the new loop. Each value from the
422   /// original loop, when vectorized, is represented by UF vector values in the
423   /// new unrolled loop, where UF is the unroll factor.
424   using VectorParts = SmallVector<Value *, 2>;
425 
426   /// Vectorize a single GetElementPtrInst based on information gathered and
427   /// decisions taken during planning.
428   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
429                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
430 
431   /// Vectorize a single PHINode in a block. This method handles the induction
432   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
433   /// arbitrary length vectors.
434   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
435 
436   /// A helper function to scalarize a single Instruction in the innermost loop.
437   /// Generates a sequence of scalar instances for each lane between \p MinLane
438   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
439   /// inclusive..
440   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
441                             bool IfPredicateInstr);
442 
443   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
444   /// is provided, the integer induction variable will first be truncated to
445   /// the corresponding type.
446   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
447 
448   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
449   /// vector or scalar value on-demand if one is not yet available. When
450   /// vectorizing a loop, we visit the definition of an instruction before its
451   /// uses. When visiting the definition, we either vectorize or scalarize the
452   /// instruction, creating an entry for it in the corresponding map. (In some
453   /// cases, such as induction variables, we will create both vector and scalar
454   /// entries.) Then, as we encounter uses of the definition, we derive values
455   /// for each scalar or vector use unless such a value is already available.
456   /// For example, if we scalarize a definition and one of its uses is vector,
457   /// we build the required vector on-demand with an insertelement sequence
458   /// when visiting the use. Otherwise, if the use is scalar, we can use the
459   /// existing scalar definition.
460   ///
461   /// Return a value in the new loop corresponding to \p V from the original
462   /// loop at unroll index \p Part. If the value has already been vectorized,
463   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
464   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
465   /// a new vector value on-demand by inserting the scalar values into a vector
466   /// with an insertelement sequence. If the value has been neither vectorized
467   /// nor scalarized, it must be loop invariant, so we simply broadcast the
468   /// value into a vector.
469   Value *getOrCreateVectorValue(Value *V, unsigned Part);
470 
471   /// Return a value in the new loop corresponding to \p V from the original
472   /// loop at unroll and vector indices \p Instance. If the value has been
473   /// vectorized but not scalarized, the necessary extractelement instruction
474   /// will be generated.
475   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
476 
477   /// Construct the vector value of a scalarized value \p V one lane at a time.
478   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
479 
480   /// Try to vectorize the interleaved access group that \p Instr belongs to
481   /// with the base address given in \p Addr, optionally masking the vector
482   /// operations if \p BlockInMask is non-null. Use \p State to translate given
483   /// VPValues to IR values in the vectorized loop.
484   void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
485                                 VPValue *Addr, VPValue *BlockInMask = nullptr);
486 
487   /// Vectorize Load and Store instructions with the base address given in \p
488   /// Addr, optionally masking the vector operations if \p BlockInMask is
489   /// non-null. Use \p State to translate given VPValues to IR values in the
490   /// vectorized loop.
491   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
492                                   VPValue *Addr, VPValue *StoredValue,
493                                   VPValue *BlockInMask);
494 
495   /// Set the debug location in the builder using the debug location in
496   /// the instruction.
497   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
498 
499   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
500   void fixNonInductionPHIs(void);
501 
502 protected:
503   friend class LoopVectorizationPlanner;
504 
505   /// A small list of PHINodes.
506   using PhiVector = SmallVector<PHINode *, 4>;
507 
508   /// A type for scalarized values in the new loop. Each value from the
509   /// original loop, when scalarized, is represented by UF x VF scalar values
510   /// in the new unrolled loop, where UF is the unroll factor and VF is the
511   /// vectorization factor.
512   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
513 
514   /// Set up the values of the IVs correctly when exiting the vector loop.
515   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
516                     Value *CountRoundDown, Value *EndValue,
517                     BasicBlock *MiddleBlock);
518 
519   /// Create a new induction variable inside L.
520   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
521                                    Value *Step, Instruction *DL);
522 
523   /// Handle all cross-iteration phis in the header.
524   void fixCrossIterationPHIs();
525 
526   /// Fix a first-order recurrence. This is the second phase of vectorizing
527   /// this phi node.
528   void fixFirstOrderRecurrence(PHINode *Phi);
529 
530   /// Fix a reduction cross-iteration phi. This is the second phase of
531   /// vectorizing this phi node.
532   void fixReduction(PHINode *Phi);
533 
534   /// Clear NSW/NUW flags from reduction instructions if necessary.
535   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
536 
537   /// The Loop exit block may have single value PHI nodes with some
538   /// incoming value. While vectorizing we only handled real values
539   /// that were defined inside the loop and we should have one value for
540   /// each predecessor of its parent basic block. See PR14725.
541   void fixLCSSAPHIs();
542 
543   /// Iteratively sink the scalarized operands of a predicated instruction into
544   /// the block that was created for it.
545   void sinkScalarOperands(Instruction *PredInst);
546 
547   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
548   /// represented as.
549   void truncateToMinimalBitwidths();
550 
551   /// Create a broadcast instruction. This method generates a broadcast
552   /// instruction (shuffle) for loop invariant values and for the induction
553   /// value. If this is the induction variable then we extend it to N, N+1, ...
554   /// this is needed because each iteration in the loop corresponds to a SIMD
555   /// element.
556   virtual Value *getBroadcastInstrs(Value *V);
557 
558   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
559   /// to each vector element of Val. The sequence starts at StartIndex.
560   /// \p Opcode is relevant for FP induction variable.
561   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
562                                Instruction::BinaryOps Opcode =
563                                Instruction::BinaryOpsEnd);
564 
565   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
566   /// variable on which to base the steps, \p Step is the size of the step, and
567   /// \p EntryVal is the value from the original loop that maps to the steps.
568   /// Note that \p EntryVal doesn't have to be an induction variable - it
569   /// can also be a truncate instruction.
570   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
571                         const InductionDescriptor &ID);
572 
573   /// Create a vector induction phi node based on an existing scalar one. \p
574   /// EntryVal is the value from the original loop that maps to the vector phi
575   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
576   /// truncate instruction, instead of widening the original IV, we widen a
577   /// version of the IV truncated to \p EntryVal's type.
578   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
579                                        Value *Step, Instruction *EntryVal);
580 
581   /// Returns true if an instruction \p I should be scalarized instead of
582   /// vectorized for the chosen vectorization factor.
583   bool shouldScalarizeInstruction(Instruction *I) const;
584 
585   /// Returns true if we should generate a scalar version of \p IV.
586   bool needsScalarInduction(Instruction *IV) const;
587 
588   /// If there is a cast involved in the induction variable \p ID, which should
589   /// be ignored in the vectorized loop body, this function records the
590   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
591   /// cast. We had already proved that the casted Phi is equal to the uncasted
592   /// Phi in the vectorized loop (under a runtime guard), and therefore
593   /// there is no need to vectorize the cast - the same value can be used in the
594   /// vector loop for both the Phi and the cast.
595   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
596   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
597   ///
598   /// \p EntryVal is the value from the original loop that maps to the vector
599   /// phi node and is used to distinguish what is the IV currently being
600   /// processed - original one (if \p EntryVal is a phi corresponding to the
601   /// original IV) or the "newly-created" one based on the proof mentioned above
602   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
603   /// latter case \p EntryVal is a TruncInst and we must not record anything for
604   /// that IV, but it's error-prone to expect callers of this routine to care
605   /// about that, hence this explicit parameter.
606   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
607                                              const Instruction *EntryVal,
608                                              Value *VectorLoopValue,
609                                              unsigned Part,
610                                              unsigned Lane = UINT_MAX);
611 
612   /// Generate a shuffle sequence that will reverse the vector Vec.
613   virtual Value *reverseVector(Value *Vec);
614 
615   /// Returns (and creates if needed) the original loop trip count.
616   Value *getOrCreateTripCount(Loop *NewLoop);
617 
618   /// Returns (and creates if needed) the trip count of the widened loop.
619   Value *getOrCreateVectorTripCount(Loop *NewLoop);
620 
621   /// Returns a bitcasted value to the requested vector type.
622   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
623   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
624                                 const DataLayout &DL);
625 
626   /// Emit a bypass check to see if the vector trip count is zero, including if
627   /// it overflows.
628   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
629 
630   /// Emit a bypass check to see if all of the SCEV assumptions we've
631   /// had to make are correct.
632   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
633 
634   /// Emit bypass checks to check any memory assumptions we may have made.
635   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
636 
637   /// Compute the transformed value of Index at offset StartValue using step
638   /// StepValue.
639   /// For integer induction, returns StartValue + Index * StepValue.
640   /// For pointer induction, returns StartValue[Index * StepValue].
641   /// FIXME: The newly created binary instructions should contain nsw/nuw
642   /// flags, which can be found from the original scalar operations.
643   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
644                               const DataLayout &DL,
645                               const InductionDescriptor &ID) const;
646 
647   /// Add additional metadata to \p To that was not present on \p Orig.
648   ///
649   /// Currently this is used to add the noalias annotations based on the
650   /// inserted memchecks.  Use this for instructions that are *cloned* into the
651   /// vector loop.
652   void addNewMetadata(Instruction *To, const Instruction *Orig);
653 
654   /// Add metadata from one instruction to another.
655   ///
656   /// This includes both the original MDs from \p From and additional ones (\see
657   /// addNewMetadata).  Use this for *newly created* instructions in the vector
658   /// loop.
659   void addMetadata(Instruction *To, Instruction *From);
660 
661   /// Similar to the previous function but it adds the metadata to a
662   /// vector of instructions.
663   void addMetadata(ArrayRef<Value *> To, Instruction *From);
664 
665   /// The original loop.
666   Loop *OrigLoop;
667 
668   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
669   /// dynamic knowledge to simplify SCEV expressions and converts them to a
670   /// more usable form.
671   PredicatedScalarEvolution &PSE;
672 
673   /// Loop Info.
674   LoopInfo *LI;
675 
676   /// Dominator Tree.
677   DominatorTree *DT;
678 
679   /// Alias Analysis.
680   AliasAnalysis *AA;
681 
682   /// Target Library Info.
683   const TargetLibraryInfo *TLI;
684 
685   /// Target Transform Info.
686   const TargetTransformInfo *TTI;
687 
688   /// Assumption Cache.
689   AssumptionCache *AC;
690 
691   /// Interface to emit optimization remarks.
692   OptimizationRemarkEmitter *ORE;
693 
694   /// LoopVersioning.  It's only set up (non-null) if memchecks were
695   /// used.
696   ///
697   /// This is currently only used to add no-alias metadata based on the
698   /// memchecks.  The actually versioning is performed manually.
699   std::unique_ptr<LoopVersioning> LVer;
700 
701   /// The vectorization SIMD factor to use. Each vector will have this many
702   /// vector elements.
703   unsigned VF;
704 
705   /// The vectorization unroll factor to use. Each scalar is vectorized to this
706   /// many different vector instructions.
707   unsigned UF;
708 
709   /// The builder that we use
710   IRBuilder<> Builder;
711 
712   // --- Vectorization state ---
713 
714   /// The vector-loop preheader.
715   BasicBlock *LoopVectorPreHeader;
716 
717   /// The scalar-loop preheader.
718   BasicBlock *LoopScalarPreHeader;
719 
720   /// Middle Block between the vector and the scalar.
721   BasicBlock *LoopMiddleBlock;
722 
723   /// The ExitBlock of the scalar loop.
724   BasicBlock *LoopExitBlock;
725 
726   /// The vector loop body.
727   BasicBlock *LoopVectorBody;
728 
729   /// The scalar loop body.
730   BasicBlock *LoopScalarBody;
731 
732   /// A list of all bypass blocks. The first block is the entry of the loop.
733   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
734 
735   /// The new Induction variable which was added to the new block.
736   PHINode *Induction = nullptr;
737 
738   /// The induction variable of the old basic block.
739   PHINode *OldInduction = nullptr;
740 
741   /// Maps values from the original loop to their corresponding values in the
742   /// vectorized loop. A key value can map to either vector values, scalar
743   /// values or both kinds of values, depending on whether the key was
744   /// vectorized and scalarized.
745   VectorizerValueMap VectorLoopValueMap;
746 
747   /// Store instructions that were predicated.
748   SmallVector<Instruction *, 4> PredicatedInstructions;
749 
750   /// Trip count of the original loop.
751   Value *TripCount = nullptr;
752 
753   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
754   Value *VectorTripCount = nullptr;
755 
756   /// The legality analysis.
757   LoopVectorizationLegality *Legal;
758 
759   /// The profitablity analysis.
760   LoopVectorizationCostModel *Cost;
761 
762   // Record whether runtime checks are added.
763   bool AddedSafetyChecks = false;
764 
765   // Holds the end values for each induction variable. We save the end values
766   // so we can later fix-up the external users of the induction variables.
767   DenseMap<PHINode *, Value *> IVEndValues;
768 
769   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
770   // fixed up at the end of vector code generation.
771   SmallVector<PHINode *, 8> OrigPHIsToFix;
772 };
773 
774 class InnerLoopUnroller : public InnerLoopVectorizer {
775 public:
776   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
777                     LoopInfo *LI, DominatorTree *DT,
778                     const TargetLibraryInfo *TLI,
779                     const TargetTransformInfo *TTI, AssumptionCache *AC,
780                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
781                     LoopVectorizationLegality *LVL,
782                     LoopVectorizationCostModel *CM)
783       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
784                             UnrollFactor, LVL, CM) {}
785 
786 private:
787   Value *getBroadcastInstrs(Value *V) override;
788   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
789                        Instruction::BinaryOps Opcode =
790                        Instruction::BinaryOpsEnd) override;
791   Value *reverseVector(Value *Vec) override;
792 };
793 
794 } // end namespace llvm
795 
796 /// Look for a meaningful debug location on the instruction or it's
797 /// operands.
798 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
799   if (!I)
800     return I;
801 
802   DebugLoc Empty;
803   if (I->getDebugLoc() != Empty)
804     return I;
805 
806   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
807     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
808       if (OpInst->getDebugLoc() != Empty)
809         return OpInst;
810   }
811 
812   return I;
813 }
814 
815 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
816   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
817     const DILocation *DIL = Inst->getDebugLoc();
818     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
819         !isa<DbgInfoIntrinsic>(Inst)) {
820       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
821       if (NewDIL)
822         B.SetCurrentDebugLocation(NewDIL.getValue());
823       else
824         LLVM_DEBUG(dbgs()
825                    << "Failed to create new discriminator: "
826                    << DIL->getFilename() << " Line: " << DIL->getLine());
827     }
828     else
829       B.SetCurrentDebugLocation(DIL);
830   } else
831     B.SetCurrentDebugLocation(DebugLoc());
832 }
833 
834 /// Write a record \p DebugMsg about vectorization failure to the debug
835 /// output stream. If \p I is passed, it is an instruction that prevents
836 /// vectorization.
837 #ifndef NDEBUG
838 static void debugVectorizationFailure(const StringRef DebugMsg,
839     Instruction *I) {
840   dbgs() << "LV: Not vectorizing: " << DebugMsg;
841   if (I != nullptr)
842     dbgs() << " " << *I;
843   else
844     dbgs() << '.';
845   dbgs() << '\n';
846 }
847 #endif
848 
849 /// Create an analysis remark that explains why vectorization failed
850 ///
851 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
852 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
853 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
854 /// the location of the remark.  \return the remark object that can be
855 /// streamed to.
856 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
857     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
858   Value *CodeRegion = TheLoop->getHeader();
859   DebugLoc DL = TheLoop->getStartLoc();
860 
861   if (I) {
862     CodeRegion = I->getParent();
863     // If there is no debug location attached to the instruction, revert back to
864     // using the loop's.
865     if (I->getDebugLoc())
866       DL = I->getDebugLoc();
867   }
868 
869   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
870   R << "loop not vectorized: ";
871   return R;
872 }
873 
874 namespace llvm {
875 
876 void reportVectorizationFailure(const StringRef DebugMsg,
877     const StringRef OREMsg, const StringRef ORETag,
878     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
879   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
880   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
881   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
882                 ORETag, TheLoop, I) << OREMsg);
883 }
884 
885 } // end namespace llvm
886 
887 #ifndef NDEBUG
888 /// \return string containing a file name and a line # for the given loop.
889 static std::string getDebugLocString(const Loop *L) {
890   std::string Result;
891   if (L) {
892     raw_string_ostream OS(Result);
893     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
894       LoopDbgLoc.print(OS);
895     else
896       // Just print the module name.
897       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
898     OS.flush();
899   }
900   return Result;
901 }
902 #endif
903 
904 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
905                                          const Instruction *Orig) {
906   // If the loop was versioned with memchecks, add the corresponding no-alias
907   // metadata.
908   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
909     LVer->annotateInstWithNoAlias(To, Orig);
910 }
911 
912 void InnerLoopVectorizer::addMetadata(Instruction *To,
913                                       Instruction *From) {
914   propagateMetadata(To, From);
915   addNewMetadata(To, From);
916 }
917 
918 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
919                                       Instruction *From) {
920   for (Value *V : To) {
921     if (Instruction *I = dyn_cast<Instruction>(V))
922       addMetadata(I, From);
923   }
924 }
925 
926 namespace llvm {
927 
928 // Loop vectorization cost-model hints how the scalar epilogue loop should be
929 // lowered.
930 enum ScalarEpilogueLowering {
931 
932   // The default: allowing scalar epilogues.
933   CM_ScalarEpilogueAllowed,
934 
935   // Vectorization with OptForSize: don't allow epilogues.
936   CM_ScalarEpilogueNotAllowedOptSize,
937 
938   // A special case of vectorisation with OptForSize: loops with a very small
939   // trip count are considered for vectorization under OptForSize, thereby
940   // making sure the cost of their loop body is dominant, free of runtime
941   // guards and scalar iteration overheads.
942   CM_ScalarEpilogueNotAllowedLowTripLoop,
943 
944   // Loop hint predicate indicating an epilogue is undesired.
945   CM_ScalarEpilogueNotNeededUsePredicate
946 };
947 
948 /// LoopVectorizationCostModel - estimates the expected speedups due to
949 /// vectorization.
950 /// In many cases vectorization is not profitable. This can happen because of
951 /// a number of reasons. In this class we mainly attempt to predict the
952 /// expected speedup/slowdowns due to the supported instruction set. We use the
953 /// TargetTransformInfo to query the different backends for the cost of
954 /// different operations.
955 class LoopVectorizationCostModel {
956 public:
957   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
958                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
959                              LoopVectorizationLegality *Legal,
960                              const TargetTransformInfo &TTI,
961                              const TargetLibraryInfo *TLI, DemandedBits *DB,
962                              AssumptionCache *AC,
963                              OptimizationRemarkEmitter *ORE, const Function *F,
964                              const LoopVectorizeHints *Hints,
965                              InterleavedAccessInfo &IAI)
966       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
967         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
968         Hints(Hints), InterleaveInfo(IAI) {}
969 
970   /// \return An upper bound for the vectorization factor, or None if
971   /// vectorization and interleaving should be avoided up front.
972   Optional<unsigned> computeMaxVF();
973 
974   /// \return True if runtime checks are required for vectorization, and false
975   /// otherwise.
976   bool runtimeChecksRequired();
977 
978   /// \return The most profitable vectorization factor and the cost of that VF.
979   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
980   /// then this vectorization factor will be selected if vectorization is
981   /// possible.
982   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
983 
984   /// Setup cost-based decisions for user vectorization factor.
985   void selectUserVectorizationFactor(unsigned UserVF) {
986     collectUniformsAndScalars(UserVF);
987     collectInstsToScalarize(UserVF);
988   }
989 
990   /// \return The size (in bits) of the smallest and widest types in the code
991   /// that needs to be vectorized. We ignore values that remain scalar such as
992   /// 64 bit loop indices.
993   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
994 
995   /// \return The desired interleave count.
996   /// If interleave count has been specified by metadata it will be returned.
997   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
998   /// are the selected vectorization factor and the cost of the selected VF.
999   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1000 
1001   /// Memory access instruction may be vectorized in more than one way.
1002   /// Form of instruction after vectorization depends on cost.
1003   /// This function takes cost-based decisions for Load/Store instructions
1004   /// and collects them in a map. This decisions map is used for building
1005   /// the lists of loop-uniform and loop-scalar instructions.
1006   /// The calculated cost is saved with widening decision in order to
1007   /// avoid redundant calculations.
1008   void setCostBasedWideningDecision(unsigned VF);
1009 
1010   /// A struct that represents some properties of the register usage
1011   /// of a loop.
1012   struct RegisterUsage {
1013     /// Holds the number of loop invariant values that are used in the loop.
1014     /// The key is ClassID of target-provided register class.
1015     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1016     /// Holds the maximum number of concurrent live intervals in the loop.
1017     /// The key is ClassID of target-provided register class.
1018     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1019   };
1020 
1021   /// \return Returns information about the register usages of the loop for the
1022   /// given vectorization factors.
1023   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1024 
1025   /// Collect values we want to ignore in the cost model.
1026   void collectValuesToIgnore();
1027 
1028   /// \returns The smallest bitwidth each instruction can be represented with.
1029   /// The vector equivalents of these instructions should be truncated to this
1030   /// type.
1031   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1032     return MinBWs;
1033   }
1034 
1035   /// \returns True if it is more profitable to scalarize instruction \p I for
1036   /// vectorization factor \p VF.
1037   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1038     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1039 
1040     // Cost model is not run in the VPlan-native path - return conservative
1041     // result until this changes.
1042     if (EnableVPlanNativePath)
1043       return false;
1044 
1045     auto Scalars = InstsToScalarize.find(VF);
1046     assert(Scalars != InstsToScalarize.end() &&
1047            "VF not yet analyzed for scalarization profitability");
1048     return Scalars->second.find(I) != Scalars->second.end();
1049   }
1050 
1051   /// Returns true if \p I is known to be uniform after vectorization.
1052   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1053     if (VF == 1)
1054       return true;
1055 
1056     // Cost model is not run in the VPlan-native path - return conservative
1057     // result until this changes.
1058     if (EnableVPlanNativePath)
1059       return false;
1060 
1061     auto UniformsPerVF = Uniforms.find(VF);
1062     assert(UniformsPerVF != Uniforms.end() &&
1063            "VF not yet analyzed for uniformity");
1064     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1065   }
1066 
1067   /// Returns true if \p I is known to be scalar after vectorization.
1068   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1069     if (VF == 1)
1070       return true;
1071 
1072     // Cost model is not run in the VPlan-native path - return conservative
1073     // result until this changes.
1074     if (EnableVPlanNativePath)
1075       return false;
1076 
1077     auto ScalarsPerVF = Scalars.find(VF);
1078     assert(ScalarsPerVF != Scalars.end() &&
1079            "Scalar values are not calculated for VF");
1080     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1081   }
1082 
1083   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1084   /// for vectorization factor \p VF.
1085   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1086     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1087            !isProfitableToScalarize(I, VF) &&
1088            !isScalarAfterVectorization(I, VF);
1089   }
1090 
1091   /// Decision that was taken during cost calculation for memory instruction.
1092   enum InstWidening {
1093     CM_Unknown,
1094     CM_Widen,         // For consecutive accesses with stride +1.
1095     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1096     CM_Interleave,
1097     CM_GatherScatter,
1098     CM_Scalarize
1099   };
1100 
1101   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1102   /// instruction \p I and vector width \p VF.
1103   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1104                            unsigned Cost) {
1105     assert(VF >= 2 && "Expected VF >=2");
1106     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1107   }
1108 
1109   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1110   /// interleaving group \p Grp and vector width \p VF.
1111   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1112                            InstWidening W, unsigned Cost) {
1113     assert(VF >= 2 && "Expected VF >=2");
1114     /// Broadcast this decicion to all instructions inside the group.
1115     /// But the cost will be assigned to one instruction only.
1116     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1117       if (auto *I = Grp->getMember(i)) {
1118         if (Grp->getInsertPos() == I)
1119           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1120         else
1121           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1122       }
1123     }
1124   }
1125 
1126   /// Return the cost model decision for the given instruction \p I and vector
1127   /// width \p VF. Return CM_Unknown if this instruction did not pass
1128   /// through the cost modeling.
1129   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1130     assert(VF >= 2 && "Expected VF >=2");
1131 
1132     // Cost model is not run in the VPlan-native path - return conservative
1133     // result until this changes.
1134     if (EnableVPlanNativePath)
1135       return CM_GatherScatter;
1136 
1137     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1138     auto Itr = WideningDecisions.find(InstOnVF);
1139     if (Itr == WideningDecisions.end())
1140       return CM_Unknown;
1141     return Itr->second.first;
1142   }
1143 
1144   /// Return the vectorization cost for the given instruction \p I and vector
1145   /// width \p VF.
1146   unsigned getWideningCost(Instruction *I, unsigned VF) {
1147     assert(VF >= 2 && "Expected VF >=2");
1148     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1149     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1150            "The cost is not calculated");
1151     return WideningDecisions[InstOnVF].second;
1152   }
1153 
1154   /// Return True if instruction \p I is an optimizable truncate whose operand
1155   /// is an induction variable. Such a truncate will be removed by adding a new
1156   /// induction variable with the destination type.
1157   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1158     // If the instruction is not a truncate, return false.
1159     auto *Trunc = dyn_cast<TruncInst>(I);
1160     if (!Trunc)
1161       return false;
1162 
1163     // Get the source and destination types of the truncate.
1164     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1165     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1166 
1167     // If the truncate is free for the given types, return false. Replacing a
1168     // free truncate with an induction variable would add an induction variable
1169     // update instruction to each iteration of the loop. We exclude from this
1170     // check the primary induction variable since it will need an update
1171     // instruction regardless.
1172     Value *Op = Trunc->getOperand(0);
1173     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1174       return false;
1175 
1176     // If the truncated value is not an induction variable, return false.
1177     return Legal->isInductionPhi(Op);
1178   }
1179 
1180   /// Collects the instructions to scalarize for each predicated instruction in
1181   /// the loop.
1182   void collectInstsToScalarize(unsigned VF);
1183 
1184   /// Collect Uniform and Scalar values for the given \p VF.
1185   /// The sets depend on CM decision for Load/Store instructions
1186   /// that may be vectorized as interleave, gather-scatter or scalarized.
1187   void collectUniformsAndScalars(unsigned VF) {
1188     // Do the analysis once.
1189     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1190       return;
1191     setCostBasedWideningDecision(VF);
1192     collectLoopUniforms(VF);
1193     collectLoopScalars(VF);
1194   }
1195 
1196   /// Returns true if the target machine supports masked store operation
1197   /// for the given \p DataType and kind of access to \p Ptr.
1198   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1199     return Legal->isConsecutivePtr(Ptr) &&
1200            TTI.isLegalMaskedStore(DataType, Alignment);
1201   }
1202 
1203   /// Returns true if the target machine supports masked load operation
1204   /// for the given \p DataType and kind of access to \p Ptr.
1205   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1206     return Legal->isConsecutivePtr(Ptr) &&
1207            TTI.isLegalMaskedLoad(DataType, Alignment);
1208   }
1209 
1210   /// Returns true if the target machine supports masked scatter operation
1211   /// for the given \p DataType.
1212   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1213     return TTI.isLegalMaskedScatter(DataType, Alignment);
1214   }
1215 
1216   /// Returns true if the target machine supports masked gather operation
1217   /// for the given \p DataType.
1218   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1219     return TTI.isLegalMaskedGather(DataType, Alignment);
1220   }
1221 
1222   /// Returns true if the target machine can represent \p V as a masked gather
1223   /// or scatter operation.
1224   bool isLegalGatherOrScatter(Value *V) {
1225     bool LI = isa<LoadInst>(V);
1226     bool SI = isa<StoreInst>(V);
1227     if (!LI && !SI)
1228       return false;
1229     auto *Ty = getMemInstValueType(V);
1230     MaybeAlign Align = getLoadStoreAlignment(V);
1231     return (LI && isLegalMaskedGather(Ty, Align)) ||
1232            (SI && isLegalMaskedScatter(Ty, Align));
1233   }
1234 
1235   /// Returns true if \p I is an instruction that will be scalarized with
1236   /// predication. Such instructions include conditional stores and
1237   /// instructions that may divide by zero.
1238   /// If a non-zero VF has been calculated, we check if I will be scalarized
1239   /// predication for that VF.
1240   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1241 
1242   // Returns true if \p I is an instruction that will be predicated either
1243   // through scalar predication or masked load/store or masked gather/scatter.
1244   // Superset of instructions that return true for isScalarWithPredication.
1245   bool isPredicatedInst(Instruction *I) {
1246     if (!blockNeedsPredication(I->getParent()))
1247       return false;
1248     // Loads and stores that need some form of masked operation are predicated
1249     // instructions.
1250     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1251       return Legal->isMaskRequired(I);
1252     return isScalarWithPredication(I);
1253   }
1254 
1255   /// Returns true if \p I is a memory instruction with consecutive memory
1256   /// access that can be widened.
1257   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1258 
1259   /// Returns true if \p I is a memory instruction in an interleaved-group
1260   /// of memory accesses that can be vectorized with wide vector loads/stores
1261   /// and shuffles.
1262   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1263 
1264   /// Check if \p Instr belongs to any interleaved access group.
1265   bool isAccessInterleaved(Instruction *Instr) {
1266     return InterleaveInfo.isInterleaved(Instr);
1267   }
1268 
1269   /// Get the interleaved access group that \p Instr belongs to.
1270   const InterleaveGroup<Instruction> *
1271   getInterleavedAccessGroup(Instruction *Instr) {
1272     return InterleaveInfo.getInterleaveGroup(Instr);
1273   }
1274 
1275   /// Returns true if an interleaved group requires a scalar iteration
1276   /// to handle accesses with gaps, and there is nothing preventing us from
1277   /// creating a scalar epilogue.
1278   bool requiresScalarEpilogue() const {
1279     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1280   }
1281 
1282   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1283   /// loop hint annotation.
1284   bool isScalarEpilogueAllowed() const {
1285     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1286   }
1287 
1288   /// Returns true if all loop blocks should be masked to fold tail loop.
1289   bool foldTailByMasking() const { return FoldTailByMasking; }
1290 
1291   bool blockNeedsPredication(BasicBlock *BB) {
1292     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1293   }
1294 
1295   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1296   /// with factor VF.  Return the cost of the instruction, including
1297   /// scalarization overhead if it's needed.
1298   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1299 
1300   /// Estimate cost of a call instruction CI if it were vectorized with factor
1301   /// VF. Return the cost of the instruction, including scalarization overhead
1302   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1303   /// scalarized -
1304   /// i.e. either vector version isn't available, or is too expensive.
1305   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1306 
1307 private:
1308   unsigned NumPredStores = 0;
1309 
1310   /// \return An upper bound for the vectorization factor, larger than zero.
1311   /// One is returned if vectorization should best be avoided due to cost.
1312   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1313 
1314   /// The vectorization cost is a combination of the cost itself and a boolean
1315   /// indicating whether any of the contributing operations will actually
1316   /// operate on
1317   /// vector values after type legalization in the backend. If this latter value
1318   /// is
1319   /// false, then all operations will be scalarized (i.e. no vectorization has
1320   /// actually taken place).
1321   using VectorizationCostTy = std::pair<unsigned, bool>;
1322 
1323   /// Returns the expected execution cost. The unit of the cost does
1324   /// not matter because we use the 'cost' units to compare different
1325   /// vector widths. The cost that is returned is *not* normalized by
1326   /// the factor width.
1327   VectorizationCostTy expectedCost(unsigned VF);
1328 
1329   /// Returns the execution time cost of an instruction for a given vector
1330   /// width. Vector width of one means scalar.
1331   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1332 
1333   /// The cost-computation logic from getInstructionCost which provides
1334   /// the vector type as an output parameter.
1335   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1336 
1337   /// Calculate vectorization cost of memory instruction \p I.
1338   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1339 
1340   /// The cost computation for scalarized memory instruction.
1341   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1342 
1343   /// The cost computation for interleaving group of memory instructions.
1344   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1345 
1346   /// The cost computation for Gather/Scatter instruction.
1347   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1348 
1349   /// The cost computation for widening instruction \p I with consecutive
1350   /// memory access.
1351   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1352 
1353   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1354   /// Load: scalar load + broadcast.
1355   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1356   /// element)
1357   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1358 
1359   /// Estimate the overhead of scalarizing an instruction. This is a
1360   /// convenience wrapper for the type-based getScalarizationOverhead API.
1361   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1362 
1363   /// Returns whether the instruction is a load or store and will be a emitted
1364   /// as a vector operation.
1365   bool isConsecutiveLoadOrStore(Instruction *I);
1366 
1367   /// Returns true if an artificially high cost for emulated masked memrefs
1368   /// should be used.
1369   bool useEmulatedMaskMemRefHack(Instruction *I);
1370 
1371   /// Map of scalar integer values to the smallest bitwidth they can be legally
1372   /// represented as. The vector equivalents of these values should be truncated
1373   /// to this type.
1374   MapVector<Instruction *, uint64_t> MinBWs;
1375 
1376   /// A type representing the costs for instructions if they were to be
1377   /// scalarized rather than vectorized. The entries are Instruction-Cost
1378   /// pairs.
1379   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1380 
1381   /// A set containing all BasicBlocks that are known to present after
1382   /// vectorization as a predicated block.
1383   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1384 
1385   /// Records whether it is allowed to have the original scalar loop execute at
1386   /// least once. This may be needed as a fallback loop in case runtime
1387   /// aliasing/dependence checks fail, or to handle the tail/remainder
1388   /// iterations when the trip count is unknown or doesn't divide by the VF,
1389   /// or as a peel-loop to handle gaps in interleave-groups.
1390   /// Under optsize and when the trip count is very small we don't allow any
1391   /// iterations to execute in the scalar loop.
1392   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1393 
1394   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1395   bool FoldTailByMasking = false;
1396 
1397   /// A map holding scalar costs for different vectorization factors. The
1398   /// presence of a cost for an instruction in the mapping indicates that the
1399   /// instruction will be scalarized when vectorizing with the associated
1400   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1401   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1402 
1403   /// Holds the instructions known to be uniform after vectorization.
1404   /// The data is collected per VF.
1405   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1406 
1407   /// Holds the instructions known to be scalar after vectorization.
1408   /// The data is collected per VF.
1409   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1410 
1411   /// Holds the instructions (address computations) that are forced to be
1412   /// scalarized.
1413   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1414 
1415   /// Returns the expected difference in cost from scalarizing the expression
1416   /// feeding a predicated instruction \p PredInst. The instructions to
1417   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1418   /// non-negative return value implies the expression will be scalarized.
1419   /// Currently, only single-use chains are considered for scalarization.
1420   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1421                               unsigned VF);
1422 
1423   /// Collect the instructions that are uniform after vectorization. An
1424   /// instruction is uniform if we represent it with a single scalar value in
1425   /// the vectorized loop corresponding to each vector iteration. Examples of
1426   /// uniform instructions include pointer operands of consecutive or
1427   /// interleaved memory accesses. Note that although uniformity implies an
1428   /// instruction will be scalar, the reverse is not true. In general, a
1429   /// scalarized instruction will be represented by VF scalar values in the
1430   /// vectorized loop, each corresponding to an iteration of the original
1431   /// scalar loop.
1432   void collectLoopUniforms(unsigned VF);
1433 
1434   /// Collect the instructions that are scalar after vectorization. An
1435   /// instruction is scalar if it is known to be uniform or will be scalarized
1436   /// during vectorization. Non-uniform scalarized instructions will be
1437   /// represented by VF values in the vectorized loop, each corresponding to an
1438   /// iteration of the original scalar loop.
1439   void collectLoopScalars(unsigned VF);
1440 
1441   /// Keeps cost model vectorization decision and cost for instructions.
1442   /// Right now it is used for memory instructions only.
1443   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1444                                 std::pair<InstWidening, unsigned>>;
1445 
1446   DecisionList WideningDecisions;
1447 
1448   /// Returns true if \p V is expected to be vectorized and it needs to be
1449   /// extracted.
1450   bool needsExtract(Value *V, unsigned VF) const {
1451     Instruction *I = dyn_cast<Instruction>(V);
1452     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1453       return false;
1454 
1455     // Assume we can vectorize V (and hence we need extraction) if the
1456     // scalars are not computed yet. This can happen, because it is called
1457     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1458     // the scalars are collected. That should be a safe assumption in most
1459     // cases, because we check if the operands have vectorizable types
1460     // beforehand in LoopVectorizationLegality.
1461     return Scalars.find(VF) == Scalars.end() ||
1462            !isScalarAfterVectorization(I, VF);
1463   };
1464 
1465   /// Returns a range containing only operands needing to be extracted.
1466   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1467                                                    unsigned VF) {
1468     return SmallVector<Value *, 4>(make_filter_range(
1469         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1470   }
1471 
1472 public:
1473   /// The loop that we evaluate.
1474   Loop *TheLoop;
1475 
1476   /// Predicated scalar evolution analysis.
1477   PredicatedScalarEvolution &PSE;
1478 
1479   /// Loop Info analysis.
1480   LoopInfo *LI;
1481 
1482   /// Vectorization legality.
1483   LoopVectorizationLegality *Legal;
1484 
1485   /// Vector target information.
1486   const TargetTransformInfo &TTI;
1487 
1488   /// Target Library Info.
1489   const TargetLibraryInfo *TLI;
1490 
1491   /// Demanded bits analysis.
1492   DemandedBits *DB;
1493 
1494   /// Assumption cache.
1495   AssumptionCache *AC;
1496 
1497   /// Interface to emit optimization remarks.
1498   OptimizationRemarkEmitter *ORE;
1499 
1500   const Function *TheFunction;
1501 
1502   /// Loop Vectorize Hint.
1503   const LoopVectorizeHints *Hints;
1504 
1505   /// The interleave access information contains groups of interleaved accesses
1506   /// with the same stride and close to each other.
1507   InterleavedAccessInfo &InterleaveInfo;
1508 
1509   /// Values to ignore in the cost model.
1510   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1511 
1512   /// Values to ignore in the cost model when VF > 1.
1513   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1514 };
1515 
1516 } // end namespace llvm
1517 
1518 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1519 // vectorization. The loop needs to be annotated with #pragma omp simd
1520 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1521 // vector length information is not provided, vectorization is not considered
1522 // explicit. Interleave hints are not allowed either. These limitations will be
1523 // relaxed in the future.
1524 // Please, note that we are currently forced to abuse the pragma 'clang
1525 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1526 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1527 // provides *explicit vectorization hints* (LV can bypass legal checks and
1528 // assume that vectorization is legal). However, both hints are implemented
1529 // using the same metadata (llvm.loop.vectorize, processed by
1530 // LoopVectorizeHints). This will be fixed in the future when the native IR
1531 // representation for pragma 'omp simd' is introduced.
1532 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1533                                    OptimizationRemarkEmitter *ORE) {
1534   assert(!OuterLp->empty() && "This is not an outer loop");
1535   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1536 
1537   // Only outer loops with an explicit vectorization hint are supported.
1538   // Unannotated outer loops are ignored.
1539   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1540     return false;
1541 
1542   Function *Fn = OuterLp->getHeader()->getParent();
1543   if (!Hints.allowVectorization(Fn, OuterLp,
1544                                 true /*VectorizeOnlyWhenForced*/)) {
1545     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1546     return false;
1547   }
1548 
1549   if (Hints.getInterleave() > 1) {
1550     // TODO: Interleave support is future work.
1551     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1552                          "outer loops.\n");
1553     Hints.emitRemarkWithHints();
1554     return false;
1555   }
1556 
1557   return true;
1558 }
1559 
1560 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1561                                   OptimizationRemarkEmitter *ORE,
1562                                   SmallVectorImpl<Loop *> &V) {
1563   // Collect inner loops and outer loops without irreducible control flow. For
1564   // now, only collect outer loops that have explicit vectorization hints. If we
1565   // are stress testing the VPlan H-CFG construction, we collect the outermost
1566   // loop of every loop nest.
1567   if (L.empty() || VPlanBuildStressTest ||
1568       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1569     LoopBlocksRPO RPOT(&L);
1570     RPOT.perform(LI);
1571     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1572       V.push_back(&L);
1573       // TODO: Collect inner loops inside marked outer loops in case
1574       // vectorization fails for the outer loop. Do not invoke
1575       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1576       // already known to be reducible. We can use an inherited attribute for
1577       // that.
1578       return;
1579     }
1580   }
1581   for (Loop *InnerL : L)
1582     collectSupportedLoops(*InnerL, LI, ORE, V);
1583 }
1584 
1585 namespace {
1586 
1587 /// The LoopVectorize Pass.
1588 struct LoopVectorize : public FunctionPass {
1589   /// Pass identification, replacement for typeid
1590   static char ID;
1591 
1592   LoopVectorizePass Impl;
1593 
1594   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1595                          bool VectorizeOnlyWhenForced = false)
1596       : FunctionPass(ID) {
1597     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1598     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1599     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1600   }
1601 
1602   bool runOnFunction(Function &F) override {
1603     if (skipFunction(F))
1604       return false;
1605 
1606     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1607     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1608     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1609     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1610     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1611     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1612     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1613     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1614     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1615     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1616     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1617     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1618     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1619 
1620     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1621         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1622 
1623     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1624                         GetLAA, *ORE, PSI);
1625   }
1626 
1627   void getAnalysisUsage(AnalysisUsage &AU) const override {
1628     AU.addRequired<AssumptionCacheTracker>();
1629     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1630     AU.addRequired<DominatorTreeWrapperPass>();
1631     AU.addRequired<LoopInfoWrapperPass>();
1632     AU.addRequired<ScalarEvolutionWrapperPass>();
1633     AU.addRequired<TargetTransformInfoWrapperPass>();
1634     AU.addRequired<AAResultsWrapperPass>();
1635     AU.addRequired<LoopAccessLegacyAnalysis>();
1636     AU.addRequired<DemandedBitsWrapperPass>();
1637     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1638     AU.addRequired<InjectTLIMappingsLegacy>();
1639 
1640     // We currently do not preserve loopinfo/dominator analyses with outer loop
1641     // vectorization. Until this is addressed, mark these analyses as preserved
1642     // only for non-VPlan-native path.
1643     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1644     if (!EnableVPlanNativePath) {
1645       AU.addPreserved<LoopInfoWrapperPass>();
1646       AU.addPreserved<DominatorTreeWrapperPass>();
1647     }
1648 
1649     AU.addPreserved<BasicAAWrapperPass>();
1650     AU.addPreserved<GlobalsAAWrapperPass>();
1651     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1652   }
1653 };
1654 
1655 } // end anonymous namespace
1656 
1657 //===----------------------------------------------------------------------===//
1658 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1659 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1660 //===----------------------------------------------------------------------===//
1661 
1662 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1663   // We need to place the broadcast of invariant variables outside the loop,
1664   // but only if it's proven safe to do so. Else, broadcast will be inside
1665   // vector loop body.
1666   Instruction *Instr = dyn_cast<Instruction>(V);
1667   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1668                      (!Instr ||
1669                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1670   // Place the code for broadcasting invariant variables in the new preheader.
1671   IRBuilder<>::InsertPointGuard Guard(Builder);
1672   if (SafeToHoist)
1673     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1674 
1675   // Broadcast the scalar into all locations in the vector.
1676   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1677 
1678   return Shuf;
1679 }
1680 
1681 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1682     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1683   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1684          "Expected either an induction phi-node or a truncate of it!");
1685   Value *Start = II.getStartValue();
1686 
1687   // Construct the initial value of the vector IV in the vector loop preheader
1688   auto CurrIP = Builder.saveIP();
1689   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1690   if (isa<TruncInst>(EntryVal)) {
1691     assert(Start->getType()->isIntegerTy() &&
1692            "Truncation requires an integer type");
1693     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1694     Step = Builder.CreateTrunc(Step, TruncType);
1695     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1696   }
1697   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1698   Value *SteppedStart =
1699       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1700 
1701   // We create vector phi nodes for both integer and floating-point induction
1702   // variables. Here, we determine the kind of arithmetic we will perform.
1703   Instruction::BinaryOps AddOp;
1704   Instruction::BinaryOps MulOp;
1705   if (Step->getType()->isIntegerTy()) {
1706     AddOp = Instruction::Add;
1707     MulOp = Instruction::Mul;
1708   } else {
1709     AddOp = II.getInductionOpcode();
1710     MulOp = Instruction::FMul;
1711   }
1712 
1713   // Multiply the vectorization factor by the step using integer or
1714   // floating-point arithmetic as appropriate.
1715   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1716   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1717 
1718   // Create a vector splat to use in the induction update.
1719   //
1720   // FIXME: If the step is non-constant, we create the vector splat with
1721   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1722   //        handle a constant vector splat.
1723   Value *SplatVF =
1724       isa<Constant>(Mul)
1725           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1726           : Builder.CreateVectorSplat(VF, Mul);
1727   Builder.restoreIP(CurrIP);
1728 
1729   // We may need to add the step a number of times, depending on the unroll
1730   // factor. The last of those goes into the PHI.
1731   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1732                                     &*LoopVectorBody->getFirstInsertionPt());
1733   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1734   Instruction *LastInduction = VecInd;
1735   for (unsigned Part = 0; Part < UF; ++Part) {
1736     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1737 
1738     if (isa<TruncInst>(EntryVal))
1739       addMetadata(LastInduction, EntryVal);
1740     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1741 
1742     LastInduction = cast<Instruction>(addFastMathFlag(
1743         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1744     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1745   }
1746 
1747   // Move the last step to the end of the latch block. This ensures consistent
1748   // placement of all induction updates.
1749   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1750   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1751   auto *ICmp = cast<Instruction>(Br->getCondition());
1752   LastInduction->moveBefore(ICmp);
1753   LastInduction->setName("vec.ind.next");
1754 
1755   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1756   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1757 }
1758 
1759 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1760   return Cost->isScalarAfterVectorization(I, VF) ||
1761          Cost->isProfitableToScalarize(I, VF);
1762 }
1763 
1764 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1765   if (shouldScalarizeInstruction(IV))
1766     return true;
1767   auto isScalarInst = [&](User *U) -> bool {
1768     auto *I = cast<Instruction>(U);
1769     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1770   };
1771   return llvm::any_of(IV->users(), isScalarInst);
1772 }
1773 
1774 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1775     const InductionDescriptor &ID, const Instruction *EntryVal,
1776     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1777   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1778          "Expected either an induction phi-node or a truncate of it!");
1779 
1780   // This induction variable is not the phi from the original loop but the
1781   // newly-created IV based on the proof that casted Phi is equal to the
1782   // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1783   // re-uses the same InductionDescriptor that original IV uses but we don't
1784   // have to do any recording in this case - that is done when original IV is
1785   // processed.
1786   if (isa<TruncInst>(EntryVal))
1787     return;
1788 
1789   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1790   if (Casts.empty())
1791     return;
1792   // Only the first Cast instruction in the Casts vector is of interest.
1793   // The rest of the Casts (if exist) have no uses outside the
1794   // induction update chain itself.
1795   Instruction *CastInst = *Casts.begin();
1796   if (Lane < UINT_MAX)
1797     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1798   else
1799     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1800 }
1801 
1802 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1803   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1804          "Primary induction variable must have an integer type");
1805 
1806   auto II = Legal->getInductionVars().find(IV);
1807   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1808 
1809   auto ID = II->second;
1810   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1811 
1812   // The value from the original loop to which we are mapping the new induction
1813   // variable.
1814   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1815 
1816   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1817 
1818   // Generate code for the induction step. Note that induction steps are
1819   // required to be loop-invariant
1820   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1821     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1822            "Induction step should be loop invariant");
1823     if (PSE.getSE()->isSCEVable(IV->getType())) {
1824       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1825       return Exp.expandCodeFor(Step, Step->getType(),
1826                                LoopVectorPreHeader->getTerminator());
1827     }
1828     return cast<SCEVUnknown>(Step)->getValue();
1829   };
1830 
1831   // The scalar value to broadcast. This is derived from the canonical
1832   // induction variable. If a truncation type is given, truncate the canonical
1833   // induction variable and step. Otherwise, derive these values from the
1834   // induction descriptor.
1835   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1836     Value *ScalarIV = Induction;
1837     if (IV != OldInduction) {
1838       ScalarIV = IV->getType()->isIntegerTy()
1839                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1840                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1841                                           IV->getType());
1842       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1843       ScalarIV->setName("offset.idx");
1844     }
1845     if (Trunc) {
1846       auto *TruncType = cast<IntegerType>(Trunc->getType());
1847       assert(Step->getType()->isIntegerTy() &&
1848              "Truncation requires an integer step");
1849       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1850       Step = Builder.CreateTrunc(Step, TruncType);
1851     }
1852     return ScalarIV;
1853   };
1854 
1855   // Create the vector values from the scalar IV, in the absence of creating a
1856   // vector IV.
1857   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1858     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1859     for (unsigned Part = 0; Part < UF; ++Part) {
1860       Value *EntryPart =
1861           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1862       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1863       if (Trunc)
1864         addMetadata(EntryPart, Trunc);
1865       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1866     }
1867   };
1868 
1869   // Now do the actual transformations, and start with creating the step value.
1870   Value *Step = CreateStepValue(ID.getStep());
1871   if (VF <= 1) {
1872     Value *ScalarIV = CreateScalarIV(Step);
1873     CreateSplatIV(ScalarIV, Step);
1874     return;
1875   }
1876 
1877   // Determine if we want a scalar version of the induction variable. This is
1878   // true if the induction variable itself is not widened, or if it has at
1879   // least one user in the loop that is not widened.
1880   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1881   if (!NeedsScalarIV) {
1882     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1883     return;
1884   }
1885 
1886   // Try to create a new independent vector induction variable. If we can't
1887   // create the phi node, we will splat the scalar induction variable in each
1888   // loop iteration.
1889   if (!shouldScalarizeInstruction(EntryVal)) {
1890     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1891     Value *ScalarIV = CreateScalarIV(Step);
1892     // Create scalar steps that can be used by instructions we will later
1893     // scalarize. Note that the addition of the scalar steps will not increase
1894     // the number of instructions in the loop in the common case prior to
1895     // InstCombine. We will be trading one vector extract for each scalar step.
1896     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1897     return;
1898   }
1899 
1900   // If we haven't yet vectorized the induction variable, splat the scalar
1901   // induction variable, and build the necessary step vectors.
1902   // TODO: Don't do it unless the vectorized IV is really required.
1903   Value *ScalarIV = CreateScalarIV(Step);
1904   CreateSplatIV(ScalarIV, Step);
1905   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1906 }
1907 
1908 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1909                                           Instruction::BinaryOps BinOp) {
1910   // Create and check the types.
1911   assert(Val->getType()->isVectorTy() && "Must be a vector");
1912   int VLen = Val->getType()->getVectorNumElements();
1913 
1914   Type *STy = Val->getType()->getScalarType();
1915   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1916          "Induction Step must be an integer or FP");
1917   assert(Step->getType() == STy && "Step has wrong type");
1918 
1919   SmallVector<Constant *, 8> Indices;
1920 
1921   if (STy->isIntegerTy()) {
1922     // Create a vector of consecutive numbers from zero to VF.
1923     for (int i = 0; i < VLen; ++i)
1924       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1925 
1926     // Add the consecutive indices to the vector value.
1927     Constant *Cv = ConstantVector::get(Indices);
1928     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1929     Step = Builder.CreateVectorSplat(VLen, Step);
1930     assert(Step->getType() == Val->getType() && "Invalid step vec");
1931     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1932     // which can be found from the original scalar operations.
1933     Step = Builder.CreateMul(Cv, Step);
1934     return Builder.CreateAdd(Val, Step, "induction");
1935   }
1936 
1937   // Floating point induction.
1938   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1939          "Binary Opcode should be specified for FP induction");
1940   // Create a vector of consecutive numbers from zero to VF.
1941   for (int i = 0; i < VLen; ++i)
1942     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1943 
1944   // Add the consecutive indices to the vector value.
1945   Constant *Cv = ConstantVector::get(Indices);
1946 
1947   Step = Builder.CreateVectorSplat(VLen, Step);
1948 
1949   // Floating point operations had to be 'fast' to enable the induction.
1950   FastMathFlags Flags;
1951   Flags.setFast();
1952 
1953   Value *MulOp = Builder.CreateFMul(Cv, Step);
1954   if (isa<Instruction>(MulOp))
1955     // Have to check, MulOp may be a constant
1956     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1957 
1958   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1959   if (isa<Instruction>(BOp))
1960     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1961   return BOp;
1962 }
1963 
1964 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1965                                            Instruction *EntryVal,
1966                                            const InductionDescriptor &ID) {
1967   // We shouldn't have to build scalar steps if we aren't vectorizing.
1968   assert(VF > 1 && "VF should be greater than one");
1969 
1970   // Get the value type and ensure it and the step have the same integer type.
1971   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1972   assert(ScalarIVTy == Step->getType() &&
1973          "Val and Step should have the same type");
1974 
1975   // We build scalar steps for both integer and floating-point induction
1976   // variables. Here, we determine the kind of arithmetic we will perform.
1977   Instruction::BinaryOps AddOp;
1978   Instruction::BinaryOps MulOp;
1979   if (ScalarIVTy->isIntegerTy()) {
1980     AddOp = Instruction::Add;
1981     MulOp = Instruction::Mul;
1982   } else {
1983     AddOp = ID.getInductionOpcode();
1984     MulOp = Instruction::FMul;
1985   }
1986 
1987   // Determine the number of scalars we need to generate for each unroll
1988   // iteration. If EntryVal is uniform, we only need to generate the first
1989   // lane. Otherwise, we generate all VF values.
1990   unsigned Lanes =
1991       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1992                                                                          : VF;
1993   // Compute the scalar steps and save the results in VectorLoopValueMap.
1994   for (unsigned Part = 0; Part < UF; ++Part) {
1995     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1996       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1997       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1998       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1999       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2000       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2001     }
2002   }
2003 }
2004 
2005 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2006   assert(V != Induction && "The new induction variable should not be used.");
2007   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2008   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2009 
2010   // If we have a stride that is replaced by one, do it here. Defer this for
2011   // the VPlan-native path until we start running Legal checks in that path.
2012   if (!EnableVPlanNativePath && Legal->hasStride(V))
2013     V = ConstantInt::get(V->getType(), 1);
2014 
2015   // If we have a vector mapped to this value, return it.
2016   if (VectorLoopValueMap.hasVectorValue(V, Part))
2017     return VectorLoopValueMap.getVectorValue(V, Part);
2018 
2019   // If the value has not been vectorized, check if it has been scalarized
2020   // instead. If it has been scalarized, and we actually need the value in
2021   // vector form, we will construct the vector values on demand.
2022   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2023     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2024 
2025     // If we've scalarized a value, that value should be an instruction.
2026     auto *I = cast<Instruction>(V);
2027 
2028     // If we aren't vectorizing, we can just copy the scalar map values over to
2029     // the vector map.
2030     if (VF == 1) {
2031       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2032       return ScalarValue;
2033     }
2034 
2035     // Get the last scalar instruction we generated for V and Part. If the value
2036     // is known to be uniform after vectorization, this corresponds to lane zero
2037     // of the Part unroll iteration. Otherwise, the last instruction is the one
2038     // we created for the last vector lane of the Part unroll iteration.
2039     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2040     auto *LastInst = cast<Instruction>(
2041         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2042 
2043     // Set the insert point after the last scalarized instruction. This ensures
2044     // the insertelement sequence will directly follow the scalar definitions.
2045     auto OldIP = Builder.saveIP();
2046     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2047     Builder.SetInsertPoint(&*NewIP);
2048 
2049     // However, if we are vectorizing, we need to construct the vector values.
2050     // If the value is known to be uniform after vectorization, we can just
2051     // broadcast the scalar value corresponding to lane zero for each unroll
2052     // iteration. Otherwise, we construct the vector values using insertelement
2053     // instructions. Since the resulting vectors are stored in
2054     // VectorLoopValueMap, we will only generate the insertelements once.
2055     Value *VectorValue = nullptr;
2056     if (Cost->isUniformAfterVectorization(I, VF)) {
2057       VectorValue = getBroadcastInstrs(ScalarValue);
2058       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2059     } else {
2060       // Initialize packing with insertelements to start from undef.
2061       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2062       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2063       for (unsigned Lane = 0; Lane < VF; ++Lane)
2064         packScalarIntoVectorValue(V, {Part, Lane});
2065       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2066     }
2067     Builder.restoreIP(OldIP);
2068     return VectorValue;
2069   }
2070 
2071   // If this scalar is unknown, assume that it is a constant or that it is
2072   // loop invariant. Broadcast V and save the value for future uses.
2073   Value *B = getBroadcastInstrs(V);
2074   VectorLoopValueMap.setVectorValue(V, Part, B);
2075   return B;
2076 }
2077 
2078 Value *
2079 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2080                                             const VPIteration &Instance) {
2081   // If the value is not an instruction contained in the loop, it should
2082   // already be scalar.
2083   if (OrigLoop->isLoopInvariant(V))
2084     return V;
2085 
2086   assert(Instance.Lane > 0
2087              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2088              : true && "Uniform values only have lane zero");
2089 
2090   // If the value from the original loop has not been vectorized, it is
2091   // represented by UF x VF scalar values in the new loop. Return the requested
2092   // scalar value.
2093   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2094     return VectorLoopValueMap.getScalarValue(V, Instance);
2095 
2096   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2097   // for the given unroll part. If this entry is not a vector type (i.e., the
2098   // vectorization factor is one), there is no need to generate an
2099   // extractelement instruction.
2100   auto *U = getOrCreateVectorValue(V, Instance.Part);
2101   if (!U->getType()->isVectorTy()) {
2102     assert(VF == 1 && "Value not scalarized has non-vector type");
2103     return U;
2104   }
2105 
2106   // Otherwise, the value from the original loop has been vectorized and is
2107   // represented by UF vector values. Extract and return the requested scalar
2108   // value from the appropriate vector lane.
2109   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2110 }
2111 
2112 void InnerLoopVectorizer::packScalarIntoVectorValue(
2113     Value *V, const VPIteration &Instance) {
2114   assert(V != Induction && "The new induction variable should not be used.");
2115   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2116   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2117 
2118   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2119   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2120   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2121                                             Builder.getInt32(Instance.Lane));
2122   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2123 }
2124 
2125 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2126   assert(Vec->getType()->isVectorTy() && "Invalid type");
2127   SmallVector<Constant *, 8> ShuffleMask;
2128   for (unsigned i = 0; i < VF; ++i)
2129     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2130 
2131   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2132                                      ConstantVector::get(ShuffleMask),
2133                                      "reverse");
2134 }
2135 
2136 // Return whether we allow using masked interleave-groups (for dealing with
2137 // strided loads/stores that reside in predicated blocks, or for dealing
2138 // with gaps).
2139 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2140   // If an override option has been passed in for interleaved accesses, use it.
2141   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2142     return EnableMaskedInterleavedMemAccesses;
2143 
2144   return TTI.enableMaskedInterleavedAccessVectorization();
2145 }
2146 
2147 // Try to vectorize the interleave group that \p Instr belongs to.
2148 //
2149 // E.g. Translate following interleaved load group (factor = 3):
2150 //   for (i = 0; i < N; i+=3) {
2151 //     R = Pic[i];             // Member of index 0
2152 //     G = Pic[i+1];           // Member of index 1
2153 //     B = Pic[i+2];           // Member of index 2
2154 //     ... // do something to R, G, B
2155 //   }
2156 // To:
2157 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2158 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2159 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2160 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2161 //
2162 // Or translate following interleaved store group (factor = 3):
2163 //   for (i = 0; i < N; i+=3) {
2164 //     ... do something to R, G, B
2165 //     Pic[i]   = R;           // Member of index 0
2166 //     Pic[i+1] = G;           // Member of index 1
2167 //     Pic[i+2] = B;           // Member of index 2
2168 //   }
2169 // To:
2170 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2171 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2172 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2173 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2174 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2175 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2176                                                    VPTransformState &State,
2177                                                    VPValue *Addr,
2178                                                    VPValue *BlockInMask) {
2179   const InterleaveGroup<Instruction> *Group =
2180       Cost->getInterleavedAccessGroup(Instr);
2181   assert(Group && "Fail to get an interleaved access group.");
2182 
2183   // Skip if current instruction is not the insert position.
2184   if (Instr != Group->getInsertPos())
2185     return;
2186 
2187   const DataLayout &DL = Instr->getModule()->getDataLayout();
2188 
2189   // Prepare for the vector type of the interleaved load/store.
2190   Type *ScalarTy = getMemInstValueType(Instr);
2191   unsigned InterleaveFactor = Group->getFactor();
2192   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2193 
2194   // Prepare for the new pointers.
2195   SmallVector<Value *, 2> AddrParts;
2196   unsigned Index = Group->getIndex(Instr);
2197 
2198   // TODO: extend the masked interleaved-group support to reversed access.
2199   assert((!BlockInMask || !Group->isReverse()) &&
2200          "Reversed masked interleave-group not supported.");
2201 
2202   // If the group is reverse, adjust the index to refer to the last vector lane
2203   // instead of the first. We adjust the index from the first vector lane,
2204   // rather than directly getting the pointer for lane VF - 1, because the
2205   // pointer operand of the interleaved access is supposed to be uniform. For
2206   // uniform instructions, we're only required to generate a value for the
2207   // first vector lane in each unroll iteration.
2208   if (Group->isReverse())
2209     Index += (VF - 1) * Group->getFactor();
2210 
2211   for (unsigned Part = 0; Part < UF; Part++) {
2212     Value *AddrPart = State.get(Addr, {Part, 0});
2213     setDebugLocFromInst(Builder, AddrPart);
2214 
2215     // Notice current instruction could be any index. Need to adjust the address
2216     // to the member of index 0.
2217     //
2218     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2219     //       b = A[i];       // Member of index 0
2220     // Current pointer is pointed to A[i+1], adjust it to A[i].
2221     //
2222     // E.g.  A[i+1] = a;     // Member of index 1
2223     //       A[i]   = b;     // Member of index 0
2224     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2225     // Current pointer is pointed to A[i+2], adjust it to A[i].
2226 
2227     bool InBounds = false;
2228     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2229       InBounds = gep->isInBounds();
2230     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2231     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2232 
2233     // Cast to the vector pointer type.
2234     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2235     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2236     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2237   }
2238 
2239   setDebugLocFromInst(Builder, Instr);
2240   Value *UndefVec = UndefValue::get(VecTy);
2241 
2242   Value *MaskForGaps = nullptr;
2243   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2244     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2245     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2246   }
2247 
2248   // Vectorize the interleaved load group.
2249   if (isa<LoadInst>(Instr)) {
2250     // For each unroll part, create a wide load for the group.
2251     SmallVector<Value *, 2> NewLoads;
2252     for (unsigned Part = 0; Part < UF; Part++) {
2253       Instruction *NewLoad;
2254       if (BlockInMask || MaskForGaps) {
2255         assert(useMaskedInterleavedAccesses(*TTI) &&
2256                "masked interleaved groups are not allowed.");
2257         Value *GroupMask = MaskForGaps;
2258         if (BlockInMask) {
2259           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2260           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2261           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2262           Value *ShuffledMask = Builder.CreateShuffleVector(
2263               BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2264           GroupMask = MaskForGaps
2265                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2266                                                 MaskForGaps)
2267                           : ShuffledMask;
2268         }
2269         NewLoad =
2270             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2271                                      GroupMask, UndefVec, "wide.masked.vec");
2272       }
2273       else
2274         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2275                                             Group->getAlign(), "wide.vec");
2276       Group->addMetadata(NewLoad);
2277       NewLoads.push_back(NewLoad);
2278     }
2279 
2280     // For each member in the group, shuffle out the appropriate data from the
2281     // wide loads.
2282     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2283       Instruction *Member = Group->getMember(I);
2284 
2285       // Skip the gaps in the group.
2286       if (!Member)
2287         continue;
2288 
2289       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2290       for (unsigned Part = 0; Part < UF; Part++) {
2291         Value *StridedVec = Builder.CreateShuffleVector(
2292             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2293 
2294         // If this member has different type, cast the result type.
2295         if (Member->getType() != ScalarTy) {
2296           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2297           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2298         }
2299 
2300         if (Group->isReverse())
2301           StridedVec = reverseVector(StridedVec);
2302 
2303         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2304       }
2305     }
2306     return;
2307   }
2308 
2309   // The sub vector type for current instruction.
2310   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2311 
2312   // Vectorize the interleaved store group.
2313   for (unsigned Part = 0; Part < UF; Part++) {
2314     // Collect the stored vector from each member.
2315     SmallVector<Value *, 4> StoredVecs;
2316     for (unsigned i = 0; i < InterleaveFactor; i++) {
2317       // Interleaved store group doesn't allow a gap, so each index has a member
2318       Instruction *Member = Group->getMember(i);
2319       assert(Member && "Fail to get a member from an interleaved store group");
2320 
2321       Value *StoredVec = getOrCreateVectorValue(
2322           cast<StoreInst>(Member)->getValueOperand(), Part);
2323       if (Group->isReverse())
2324         StoredVec = reverseVector(StoredVec);
2325 
2326       // If this member has different type, cast it to a unified type.
2327 
2328       if (StoredVec->getType() != SubVT)
2329         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2330 
2331       StoredVecs.push_back(StoredVec);
2332     }
2333 
2334     // Concatenate all vectors into a wide vector.
2335     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2336 
2337     // Interleave the elements in the wide vector.
2338     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2339     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2340                                               "interleaved.vec");
2341 
2342     Instruction *NewStoreInstr;
2343     if (BlockInMask) {
2344       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2345       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2346       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2347       Value *ShuffledMask = Builder.CreateShuffleVector(
2348           BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2349       NewStoreInstr = Builder.CreateMaskedStore(
2350           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2351     }
2352     else
2353       NewStoreInstr =
2354           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2355 
2356     Group->addMetadata(NewStoreInstr);
2357   }
2358 }
2359 
2360 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2361                                                      VPTransformState &State,
2362                                                      VPValue *Addr,
2363                                                      VPValue *StoredValue,
2364                                                      VPValue *BlockInMask) {
2365   // Attempt to issue a wide load.
2366   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2367   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2368 
2369   assert((LI || SI) && "Invalid Load/Store instruction");
2370   assert((!SI || StoredValue) && "No stored value provided for widened store");
2371   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2372 
2373   LoopVectorizationCostModel::InstWidening Decision =
2374       Cost->getWideningDecision(Instr, VF);
2375   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2376          "CM decision should be taken at this point");
2377   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2378     return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2379 
2380   Type *ScalarDataTy = getMemInstValueType(Instr);
2381   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2382   // An alignment of 0 means target abi alignment. We need to use the scalar's
2383   // target abi alignment in such a case.
2384   const DataLayout &DL = Instr->getModule()->getDataLayout();
2385   const Align Alignment =
2386       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2387 
2388   // Determine if the pointer operand of the access is either consecutive or
2389   // reverse consecutive.
2390   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2391   bool ConsecutiveStride =
2392       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2393   bool CreateGatherScatter =
2394       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2395 
2396   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2397   // gather/scatter. Otherwise Decision should have been to Scalarize.
2398   assert((ConsecutiveStride || CreateGatherScatter) &&
2399          "The instruction should be scalarized");
2400   (void)ConsecutiveStride;
2401 
2402   VectorParts BlockInMaskParts(UF);
2403   bool isMaskRequired = BlockInMask;
2404   if (isMaskRequired)
2405     for (unsigned Part = 0; Part < UF; ++Part)
2406       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2407 
2408   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2409     // Calculate the pointer for the specific unroll-part.
2410     GetElementPtrInst *PartPtr = nullptr;
2411 
2412     bool InBounds = false;
2413     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2414       InBounds = gep->isInBounds();
2415 
2416     if (Reverse) {
2417       // If the address is consecutive but reversed, then the
2418       // wide store needs to start at the last vector element.
2419       PartPtr = cast<GetElementPtrInst>(
2420           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2421       PartPtr->setIsInBounds(InBounds);
2422       PartPtr = cast<GetElementPtrInst>(
2423           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2424       PartPtr->setIsInBounds(InBounds);
2425       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2426         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2427     } else {
2428       PartPtr = cast<GetElementPtrInst>(
2429           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2430       PartPtr->setIsInBounds(InBounds);
2431     }
2432 
2433     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2434     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2435   };
2436 
2437   // Handle Stores:
2438   if (SI) {
2439     setDebugLocFromInst(Builder, SI);
2440 
2441     for (unsigned Part = 0; Part < UF; ++Part) {
2442       Instruction *NewSI = nullptr;
2443       Value *StoredVal = State.get(StoredValue, Part);
2444       if (CreateGatherScatter) {
2445         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2446         Value *VectorGep = State.get(Addr, Part);
2447         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2448                                             MaskPart);
2449       } else {
2450         if (Reverse) {
2451           // If we store to reverse consecutive memory locations, then we need
2452           // to reverse the order of elements in the stored value.
2453           StoredVal = reverseVector(StoredVal);
2454           // We don't want to update the value in the map as it might be used in
2455           // another expression. So don't call resetVectorValue(StoredVal).
2456         }
2457         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2458         if (isMaskRequired)
2459           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2460                                             BlockInMaskParts[Part]);
2461         else
2462           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2463       }
2464       addMetadata(NewSI, SI);
2465     }
2466     return;
2467   }
2468 
2469   // Handle loads.
2470   assert(LI && "Must have a load instruction");
2471   setDebugLocFromInst(Builder, LI);
2472   for (unsigned Part = 0; Part < UF; ++Part) {
2473     Value *NewLI;
2474     if (CreateGatherScatter) {
2475       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2476       Value *VectorGep = State.get(Addr, Part);
2477       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2478                                          nullptr, "wide.masked.gather");
2479       addMetadata(NewLI, LI);
2480     } else {
2481       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2482       if (isMaskRequired)
2483         NewLI = Builder.CreateMaskedLoad(
2484             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2485             "wide.masked.load");
2486       else
2487         NewLI =
2488             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2489 
2490       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2491       addMetadata(NewLI, LI);
2492       if (Reverse)
2493         NewLI = reverseVector(NewLI);
2494     }
2495     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2496   }
2497 }
2498 
2499 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2500                                                const VPIteration &Instance,
2501                                                bool IfPredicateInstr) {
2502   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2503 
2504   setDebugLocFromInst(Builder, Instr);
2505 
2506   // Does this instruction return a value ?
2507   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2508 
2509   Instruction *Cloned = Instr->clone();
2510   if (!IsVoidRetTy)
2511     Cloned->setName(Instr->getName() + ".cloned");
2512 
2513   // Replace the operands of the cloned instructions with their scalar
2514   // equivalents in the new loop.
2515   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2516     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2517     Cloned->setOperand(op, NewOp);
2518   }
2519   addNewMetadata(Cloned, Instr);
2520 
2521   // Place the cloned scalar in the new loop.
2522   Builder.Insert(Cloned);
2523 
2524   // Add the cloned scalar to the scalar map entry.
2525   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2526 
2527   // If we just cloned a new assumption, add it the assumption cache.
2528   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2529     if (II->getIntrinsicID() == Intrinsic::assume)
2530       AC->registerAssumption(II);
2531 
2532   // End if-block.
2533   if (IfPredicateInstr)
2534     PredicatedInstructions.push_back(Cloned);
2535 }
2536 
2537 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2538                                                       Value *End, Value *Step,
2539                                                       Instruction *DL) {
2540   BasicBlock *Header = L->getHeader();
2541   BasicBlock *Latch = L->getLoopLatch();
2542   // As we're just creating this loop, it's possible no latch exists
2543   // yet. If so, use the header as this will be a single block loop.
2544   if (!Latch)
2545     Latch = Header;
2546 
2547   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2548   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2549   setDebugLocFromInst(Builder, OldInst);
2550   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2551 
2552   Builder.SetInsertPoint(Latch->getTerminator());
2553   setDebugLocFromInst(Builder, OldInst);
2554 
2555   // Create i+1 and fill the PHINode.
2556   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2557   Induction->addIncoming(Start, L->getLoopPreheader());
2558   Induction->addIncoming(Next, Latch);
2559   // Create the compare.
2560   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2561   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2562 
2563   // Now we have two terminators. Remove the old one from the block.
2564   Latch->getTerminator()->eraseFromParent();
2565 
2566   return Induction;
2567 }
2568 
2569 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2570   if (TripCount)
2571     return TripCount;
2572 
2573   assert(L && "Create Trip Count for null loop.");
2574   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2575   // Find the loop boundaries.
2576   ScalarEvolution *SE = PSE.getSE();
2577   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2578   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2579          "Invalid loop count");
2580 
2581   Type *IdxTy = Legal->getWidestInductionType();
2582   assert(IdxTy && "No type for induction");
2583 
2584   // The exit count might have the type of i64 while the phi is i32. This can
2585   // happen if we have an induction variable that is sign extended before the
2586   // compare. The only way that we get a backedge taken count is that the
2587   // induction variable was signed and as such will not overflow. In such a case
2588   // truncation is legal.
2589   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2590       IdxTy->getPrimitiveSizeInBits())
2591     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2592   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2593 
2594   // Get the total trip count from the count by adding 1.
2595   const SCEV *ExitCount = SE->getAddExpr(
2596       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2597 
2598   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2599 
2600   // Expand the trip count and place the new instructions in the preheader.
2601   // Notice that the pre-header does not change, only the loop body.
2602   SCEVExpander Exp(*SE, DL, "induction");
2603 
2604   // Count holds the overall loop count (N).
2605   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2606                                 L->getLoopPreheader()->getTerminator());
2607 
2608   if (TripCount->getType()->isPointerTy())
2609     TripCount =
2610         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2611                                     L->getLoopPreheader()->getTerminator());
2612 
2613   return TripCount;
2614 }
2615 
2616 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2617   if (VectorTripCount)
2618     return VectorTripCount;
2619 
2620   Value *TC = getOrCreateTripCount(L);
2621   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2622 
2623   Type *Ty = TC->getType();
2624   Constant *Step = ConstantInt::get(Ty, VF * UF);
2625 
2626   // If the tail is to be folded by masking, round the number of iterations N
2627   // up to a multiple of Step instead of rounding down. This is done by first
2628   // adding Step-1 and then rounding down. Note that it's ok if this addition
2629   // overflows: the vector induction variable will eventually wrap to zero given
2630   // that it starts at zero and its Step is a power of two; the loop will then
2631   // exit, with the last early-exit vector comparison also producing all-true.
2632   if (Cost->foldTailByMasking()) {
2633     assert(isPowerOf2_32(VF * UF) &&
2634            "VF*UF must be a power of 2 when folding tail by masking");
2635     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2636   }
2637 
2638   // Now we need to generate the expression for the part of the loop that the
2639   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2640   // iterations are not required for correctness, or N - Step, otherwise. Step
2641   // is equal to the vectorization factor (number of SIMD elements) times the
2642   // unroll factor (number of SIMD instructions).
2643   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2644 
2645   // If there is a non-reversed interleaved group that may speculatively access
2646   // memory out-of-bounds, we need to ensure that there will be at least one
2647   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2648   // the trip count, we set the remainder to be equal to the step. If the step
2649   // does not evenly divide the trip count, no adjustment is necessary since
2650   // there will already be scalar iterations. Note that the minimum iterations
2651   // check ensures that N >= Step.
2652   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2653     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2654     R = Builder.CreateSelect(IsZero, Step, R);
2655   }
2656 
2657   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2658 
2659   return VectorTripCount;
2660 }
2661 
2662 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2663                                                    const DataLayout &DL) {
2664   // Verify that V is a vector type with same number of elements as DstVTy.
2665   unsigned VF = DstVTy->getNumElements();
2666   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2667   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2668   Type *SrcElemTy = SrcVecTy->getElementType();
2669   Type *DstElemTy = DstVTy->getElementType();
2670   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2671          "Vector elements must have same size");
2672 
2673   // Do a direct cast if element types are castable.
2674   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2675     return Builder.CreateBitOrPointerCast(V, DstVTy);
2676   }
2677   // V cannot be directly casted to desired vector type.
2678   // May happen when V is a floating point vector but DstVTy is a vector of
2679   // pointers or vice-versa. Handle this using a two-step bitcast using an
2680   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2681   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2682          "Only one type should be a pointer type");
2683   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2684          "Only one type should be a floating point type");
2685   Type *IntTy =
2686       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2687   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2688   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2689   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2690 }
2691 
2692 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2693                                                          BasicBlock *Bypass) {
2694   Value *Count = getOrCreateTripCount(L);
2695   // Reuse existing vector loop preheader for TC checks.
2696   // Note that new preheader block is generated for vector loop.
2697   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2698   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2699 
2700   // Generate code to check if the loop's trip count is less than VF * UF, or
2701   // equal to it in case a scalar epilogue is required; this implies that the
2702   // vector trip count is zero. This check also covers the case where adding one
2703   // to the backedge-taken count overflowed leading to an incorrect trip count
2704   // of zero. In this case we will also jump to the scalar loop.
2705   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2706                                           : ICmpInst::ICMP_ULT;
2707 
2708   // If tail is to be folded, vector loop takes care of all iterations.
2709   Value *CheckMinIters = Builder.getFalse();
2710   if (!Cost->foldTailByMasking())
2711     CheckMinIters = Builder.CreateICmp(
2712         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2713         "min.iters.check");
2714 
2715   // Create new preheader for vector loop.
2716   LoopVectorPreHeader =
2717       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2718                  "vector.ph");
2719 
2720   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2721                                DT->getNode(Bypass)->getIDom()) &&
2722          "TC check is expected to dominate Bypass");
2723 
2724   // Update dominator for Bypass & LoopExit.
2725   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2726   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2727 
2728   ReplaceInstWithInst(
2729       TCCheckBlock->getTerminator(),
2730       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2731   LoopBypassBlocks.push_back(TCCheckBlock);
2732 }
2733 
2734 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2735   // Reuse existing vector loop preheader for SCEV checks.
2736   // Note that new preheader block is generated for vector loop.
2737   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2738 
2739   // Generate the code to check that the SCEV assumptions that we made.
2740   // We want the new basic block to start at the first instruction in a
2741   // sequence of instructions that form a check.
2742   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2743                    "scev.check");
2744   Value *SCEVCheck = Exp.expandCodeForPredicate(
2745       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2746 
2747   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2748     if (C->isZero())
2749       return;
2750 
2751   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2752          "Cannot SCEV check stride or overflow when optimizing for size");
2753 
2754   SCEVCheckBlock->setName("vector.scevcheck");
2755   // Create new preheader for vector loop.
2756   LoopVectorPreHeader =
2757       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2758                  nullptr, "vector.ph");
2759 
2760   // Update dominator only if this is first RT check.
2761   if (LoopBypassBlocks.empty()) {
2762     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2763     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2764   }
2765 
2766   ReplaceInstWithInst(
2767       SCEVCheckBlock->getTerminator(),
2768       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2769   LoopBypassBlocks.push_back(SCEVCheckBlock);
2770   AddedSafetyChecks = true;
2771 }
2772 
2773 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2774   // VPlan-native path does not do any analysis for runtime checks currently.
2775   if (EnableVPlanNativePath)
2776     return;
2777 
2778   // Reuse existing vector loop preheader for runtime memory checks.
2779   // Note that new preheader block is generated for vector loop.
2780   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2781 
2782   // Generate the code that checks in runtime if arrays overlap. We put the
2783   // checks into a separate block to make the more common case of few elements
2784   // faster.
2785   Instruction *FirstCheckInst;
2786   Instruction *MemRuntimeCheck;
2787   std::tie(FirstCheckInst, MemRuntimeCheck) =
2788       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2789   if (!MemRuntimeCheck)
2790     return;
2791 
2792   if (MemCheckBlock->getParent()->hasOptSize()) {
2793     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2794            "Cannot emit memory checks when optimizing for size, unless forced "
2795            "to vectorize.");
2796     ORE->emit([&]() {
2797       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2798                                         L->getStartLoc(), L->getHeader())
2799              << "Code-size may be reduced by not forcing "
2800                 "vectorization, or by source-code modifications "
2801                 "eliminating the need for runtime checks "
2802                 "(e.g., adding 'restrict').";
2803     });
2804   }
2805 
2806   MemCheckBlock->setName("vector.memcheck");
2807   // Create new preheader for vector loop.
2808   LoopVectorPreHeader =
2809       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2810                  "vector.ph");
2811 
2812   // Update dominator only if this is first RT check.
2813   if (LoopBypassBlocks.empty()) {
2814     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2815     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2816   }
2817 
2818   ReplaceInstWithInst(
2819       MemCheckBlock->getTerminator(),
2820       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2821   LoopBypassBlocks.push_back(MemCheckBlock);
2822   AddedSafetyChecks = true;
2823 
2824   // We currently don't use LoopVersioning for the actual loop cloning but we
2825   // still use it to add the noalias metadata.
2826   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2827                                           PSE.getSE());
2828   LVer->prepareNoAliasMetadata();
2829 }
2830 
2831 Value *InnerLoopVectorizer::emitTransformedIndex(
2832     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2833     const InductionDescriptor &ID) const {
2834 
2835   SCEVExpander Exp(*SE, DL, "induction");
2836   auto Step = ID.getStep();
2837   auto StartValue = ID.getStartValue();
2838   assert(Index->getType() == Step->getType() &&
2839          "Index type does not match StepValue type");
2840 
2841   // Note: the IR at this point is broken. We cannot use SE to create any new
2842   // SCEV and then expand it, hoping that SCEV's simplification will give us
2843   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2844   // lead to various SCEV crashes. So all we can do is to use builder and rely
2845   // on InstCombine for future simplifications. Here we handle some trivial
2846   // cases only.
2847   auto CreateAdd = [&B](Value *X, Value *Y) {
2848     assert(X->getType() == Y->getType() && "Types don't match!");
2849     if (auto *CX = dyn_cast<ConstantInt>(X))
2850       if (CX->isZero())
2851         return Y;
2852     if (auto *CY = dyn_cast<ConstantInt>(Y))
2853       if (CY->isZero())
2854         return X;
2855     return B.CreateAdd(X, Y);
2856   };
2857 
2858   auto CreateMul = [&B](Value *X, Value *Y) {
2859     assert(X->getType() == Y->getType() && "Types don't match!");
2860     if (auto *CX = dyn_cast<ConstantInt>(X))
2861       if (CX->isOne())
2862         return Y;
2863     if (auto *CY = dyn_cast<ConstantInt>(Y))
2864       if (CY->isOne())
2865         return X;
2866     return B.CreateMul(X, Y);
2867   };
2868 
2869   switch (ID.getKind()) {
2870   case InductionDescriptor::IK_IntInduction: {
2871     assert(Index->getType() == StartValue->getType() &&
2872            "Index type does not match StartValue type");
2873     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2874       return B.CreateSub(StartValue, Index);
2875     auto *Offset = CreateMul(
2876         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2877     return CreateAdd(StartValue, Offset);
2878   }
2879   case InductionDescriptor::IK_PtrInduction: {
2880     assert(isa<SCEVConstant>(Step) &&
2881            "Expected constant step for pointer induction");
2882     return B.CreateGEP(
2883         StartValue->getType()->getPointerElementType(), StartValue,
2884         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2885                                            &*B.GetInsertPoint())));
2886   }
2887   case InductionDescriptor::IK_FpInduction: {
2888     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2889     auto InductionBinOp = ID.getInductionBinOp();
2890     assert(InductionBinOp &&
2891            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2892             InductionBinOp->getOpcode() == Instruction::FSub) &&
2893            "Original bin op should be defined for FP induction");
2894 
2895     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2896 
2897     // Floating point operations had to be 'fast' to enable the induction.
2898     FastMathFlags Flags;
2899     Flags.setFast();
2900 
2901     Value *MulExp = B.CreateFMul(StepValue, Index);
2902     if (isa<Instruction>(MulExp))
2903       // We have to check, the MulExp may be a constant.
2904       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2905 
2906     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2907                                "induction");
2908     if (isa<Instruction>(BOp))
2909       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2910 
2911     return BOp;
2912   }
2913   case InductionDescriptor::IK_NoInduction:
2914     return nullptr;
2915   }
2916   llvm_unreachable("invalid enum");
2917 }
2918 
2919 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2920   /*
2921    In this function we generate a new loop. The new loop will contain
2922    the vectorized instructions while the old loop will continue to run the
2923    scalar remainder.
2924 
2925        [ ] <-- loop iteration number check.
2926     /   |
2927    /    v
2928   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2929   |  /  |
2930   | /   v
2931   ||   [ ]     <-- vector pre header.
2932   |/    |
2933   |     v
2934   |    [  ] \
2935   |    [  ]_|   <-- vector loop.
2936   |     |
2937   |     v
2938   |   -[ ]   <--- middle-block.
2939   |  /  |
2940   | /   v
2941   -|- >[ ]     <--- new preheader.
2942    |    |
2943    |    v
2944    |   [ ] \
2945    |   [ ]_|   <-- old scalar loop to handle remainder.
2946     \   |
2947      \  v
2948       >[ ]     <-- exit block.
2949    ...
2950    */
2951 
2952   MDNode *OrigLoopID = OrigLoop->getLoopID();
2953 
2954   // Some loops have a single integer induction variable, while other loops
2955   // don't. One example is c++ iterators that often have multiple pointer
2956   // induction variables. In the code below we also support a case where we
2957   // don't have a single induction variable.
2958   //
2959   // We try to obtain an induction variable from the original loop as hard
2960   // as possible. However if we don't find one that:
2961   //   - is an integer
2962   //   - counts from zero, stepping by one
2963   //   - is the size of the widest induction variable type
2964   // then we create a new one.
2965   OldInduction = Legal->getPrimaryInduction();
2966   Type *IdxTy = Legal->getWidestInductionType();
2967 
2968   // Split the single block loop into the two loop structure described above.
2969   LoopScalarBody = OrigLoop->getHeader();
2970   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2971   LoopExitBlock = OrigLoop->getExitBlock();
2972   assert(LoopExitBlock && "Must have an exit block");
2973   assert(LoopVectorPreHeader && "Invalid loop structure");
2974 
2975   LoopMiddleBlock =
2976       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2977                  LI, nullptr, "middle.block");
2978   LoopScalarPreHeader =
2979       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2980                  nullptr, "scalar.ph");
2981   // We intentionally don't let SplitBlock to update LoopInfo since
2982   // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
2983   // LoopVectorBody is explicitly added to the correct place few lines later.
2984   LoopVectorBody =
2985       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2986                  nullptr, nullptr, "vector.body");
2987 
2988   // Update dominator for loop exit.
2989   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2990 
2991   // Create and register the new vector loop.
2992   Loop *Lp = LI->AllocateLoop();
2993   Loop *ParentLoop = OrigLoop->getParentLoop();
2994 
2995   // Insert the new loop into the loop nest and register the new basic blocks
2996   // before calling any utilities such as SCEV that require valid LoopInfo.
2997   if (ParentLoop) {
2998     ParentLoop->addChildLoop(Lp);
2999   } else {
3000     LI->addTopLevelLoop(Lp);
3001   }
3002   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3003 
3004   // Find the loop boundaries.
3005   Value *Count = getOrCreateTripCount(Lp);
3006 
3007   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3008 
3009   // Now, compare the new count to zero. If it is zero skip the vector loop and
3010   // jump to the scalar loop. This check also covers the case where the
3011   // backedge-taken count is uint##_max: adding one to it will overflow leading
3012   // to an incorrect trip count of zero. In this (rare) case we will also jump
3013   // to the scalar loop.
3014   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3015 
3016   // Generate the code to check any assumptions that we've made for SCEV
3017   // expressions.
3018   emitSCEVChecks(Lp, LoopScalarPreHeader);
3019 
3020   // Generate the code that checks in runtime if arrays overlap. We put the
3021   // checks into a separate block to make the more common case of few elements
3022   // faster.
3023   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3024 
3025   // Generate the induction variable.
3026   // The loop step is equal to the vectorization factor (num of SIMD elements)
3027   // times the unroll factor (num of SIMD instructions).
3028   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3029   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3030   Induction =
3031       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3032                               getDebugLocFromInstOrOperands(OldInduction));
3033 
3034   // We are going to resume the execution of the scalar loop.
3035   // Go over all of the induction variables that we found and fix the
3036   // PHIs that are left in the scalar version of the loop.
3037   // The starting values of PHI nodes depend on the counter of the last
3038   // iteration in the vectorized loop.
3039   // If we come from a bypass edge then we need to start from the original
3040   // start value.
3041 
3042   // This variable saves the new starting index for the scalar loop. It is used
3043   // to test if there are any tail iterations left once the vector loop has
3044   // completed.
3045   for (auto &InductionEntry : Legal->getInductionVars()) {
3046     PHINode *OrigPhi = InductionEntry.first;
3047     InductionDescriptor II = InductionEntry.second;
3048 
3049     // Create phi nodes to merge from the  backedge-taken check block.
3050     PHINode *BCResumeVal =
3051         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3052                         LoopScalarPreHeader->getTerminator());
3053     // Copy original phi DL over to the new one.
3054     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3055     Value *&EndValue = IVEndValues[OrigPhi];
3056     if (OrigPhi == OldInduction) {
3057       // We know what the end value is.
3058       EndValue = CountRoundDown;
3059     } else {
3060       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3061       Type *StepType = II.getStep()->getType();
3062       Instruction::CastOps CastOp =
3063           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3064       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3065       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3066       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3067       EndValue->setName("ind.end");
3068     }
3069 
3070     // The new PHI merges the original incoming value, in case of a bypass,
3071     // or the value at the end of the vectorized loop.
3072     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3073 
3074     // Fix the scalar body counter (PHI node).
3075     // The old induction's phi node in the scalar body needs the truncated
3076     // value.
3077     for (BasicBlock *BB : LoopBypassBlocks)
3078       BCResumeVal->addIncoming(II.getStartValue(), BB);
3079     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3080   }
3081 
3082   // We need the OrigLoop (scalar loop part) latch terminator to help
3083   // produce correct debug info for the middle block BB instructions.
3084   // The legality check stage guarantees that the loop will have a single
3085   // latch.
3086   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3087          "Scalar loop latch terminator isn't a branch");
3088   BranchInst *ScalarLatchBr =
3089       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3090 
3091   // Add a check in the middle block to see if we have completed
3092   // all of the iterations in the first vector loop.
3093   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3094   // If tail is to be folded, we know we don't need to run the remainder.
3095   Value *CmpN = Builder.getTrue();
3096   if (!Cost->foldTailByMasking()) {
3097     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3098                            CountRoundDown, "cmp.n",
3099                            LoopMiddleBlock->getTerminator());
3100 
3101     // Here we use the same DebugLoc as the scalar loop latch branch instead
3102     // of the corresponding compare because they may have ended up with
3103     // different line numbers and we want to avoid awkward line stepping while
3104     // debugging. Eg. if the compare has got a line number inside the loop.
3105     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3106   }
3107 
3108   BranchInst *BrInst =
3109       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3110   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3111   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3112 
3113   // Get ready to start creating new instructions into the vectorized body.
3114   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3115          "Inconsistent vector loop preheader");
3116   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3117 
3118   Optional<MDNode *> VectorizedLoopID =
3119       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3120                                       LLVMLoopVectorizeFollowupVectorized});
3121   if (VectorizedLoopID.hasValue()) {
3122     Lp->setLoopID(VectorizedLoopID.getValue());
3123 
3124     // Do not setAlreadyVectorized if loop attributes have been defined
3125     // explicitly.
3126     return LoopVectorPreHeader;
3127   }
3128 
3129   // Keep all loop hints from the original loop on the vector loop (we'll
3130   // replace the vectorizer-specific hints below).
3131   if (MDNode *LID = OrigLoop->getLoopID())
3132     Lp->setLoopID(LID);
3133 
3134   LoopVectorizeHints Hints(Lp, true, *ORE);
3135   Hints.setAlreadyVectorized();
3136 
3137 #ifdef EXPENSIVE_CHECKS
3138   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3139   LI->verify(*DT);
3140 #endif
3141 
3142   return LoopVectorPreHeader;
3143 }
3144 
3145 // Fix up external users of the induction variable. At this point, we are
3146 // in LCSSA form, with all external PHIs that use the IV having one input value,
3147 // coming from the remainder loop. We need those PHIs to also have a correct
3148 // value for the IV when arriving directly from the middle block.
3149 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3150                                        const InductionDescriptor &II,
3151                                        Value *CountRoundDown, Value *EndValue,
3152                                        BasicBlock *MiddleBlock) {
3153   // There are two kinds of external IV usages - those that use the value
3154   // computed in the last iteration (the PHI) and those that use the penultimate
3155   // value (the value that feeds into the phi from the loop latch).
3156   // We allow both, but they, obviously, have different values.
3157 
3158   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3159 
3160   DenseMap<Value *, Value *> MissingVals;
3161 
3162   // An external user of the last iteration's value should see the value that
3163   // the remainder loop uses to initialize its own IV.
3164   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3165   for (User *U : PostInc->users()) {
3166     Instruction *UI = cast<Instruction>(U);
3167     if (!OrigLoop->contains(UI)) {
3168       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3169       MissingVals[UI] = EndValue;
3170     }
3171   }
3172 
3173   // An external user of the penultimate value need to see EndValue - Step.
3174   // The simplest way to get this is to recompute it from the constituent SCEVs,
3175   // that is Start + (Step * (CRD - 1)).
3176   for (User *U : OrigPhi->users()) {
3177     auto *UI = cast<Instruction>(U);
3178     if (!OrigLoop->contains(UI)) {
3179       const DataLayout &DL =
3180           OrigLoop->getHeader()->getModule()->getDataLayout();
3181       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3182 
3183       IRBuilder<> B(MiddleBlock->getTerminator());
3184       Value *CountMinusOne = B.CreateSub(
3185           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3186       Value *CMO =
3187           !II.getStep()->getType()->isIntegerTy()
3188               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3189                              II.getStep()->getType())
3190               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3191       CMO->setName("cast.cmo");
3192       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3193       Escape->setName("ind.escape");
3194       MissingVals[UI] = Escape;
3195     }
3196   }
3197 
3198   for (auto &I : MissingVals) {
3199     PHINode *PHI = cast<PHINode>(I.first);
3200     // One corner case we have to handle is two IVs "chasing" each-other,
3201     // that is %IV2 = phi [...], [ %IV1, %latch ]
3202     // In this case, if IV1 has an external use, we need to avoid adding both
3203     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3204     // don't already have an incoming value for the middle block.
3205     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3206       PHI->addIncoming(I.second, MiddleBlock);
3207   }
3208 }
3209 
3210 namespace {
3211 
3212 struct CSEDenseMapInfo {
3213   static bool canHandle(const Instruction *I) {
3214     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3215            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3216   }
3217 
3218   static inline Instruction *getEmptyKey() {
3219     return DenseMapInfo<Instruction *>::getEmptyKey();
3220   }
3221 
3222   static inline Instruction *getTombstoneKey() {
3223     return DenseMapInfo<Instruction *>::getTombstoneKey();
3224   }
3225 
3226   static unsigned getHashValue(const Instruction *I) {
3227     assert(canHandle(I) && "Unknown instruction!");
3228     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3229                                                            I->value_op_end()));
3230   }
3231 
3232   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3233     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3234         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3235       return LHS == RHS;
3236     return LHS->isIdenticalTo(RHS);
3237   }
3238 };
3239 
3240 } // end anonymous namespace
3241 
3242 ///Perform cse of induction variable instructions.
3243 static void cse(BasicBlock *BB) {
3244   // Perform simple cse.
3245   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3246   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3247     Instruction *In = &*I++;
3248 
3249     if (!CSEDenseMapInfo::canHandle(In))
3250       continue;
3251 
3252     // Check if we can replace this instruction with any of the
3253     // visited instructions.
3254     if (Instruction *V = CSEMap.lookup(In)) {
3255       In->replaceAllUsesWith(V);
3256       In->eraseFromParent();
3257       continue;
3258     }
3259 
3260     CSEMap[In] = In;
3261   }
3262 }
3263 
3264 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3265                                                        unsigned VF,
3266                                                        bool &NeedToScalarize) {
3267   Function *F = CI->getCalledFunction();
3268   Type *ScalarRetTy = CI->getType();
3269   SmallVector<Type *, 4> Tys, ScalarTys;
3270   for (auto &ArgOp : CI->arg_operands())
3271     ScalarTys.push_back(ArgOp->getType());
3272 
3273   // Estimate cost of scalarized vector call. The source operands are assumed
3274   // to be vectors, so we need to extract individual elements from there,
3275   // execute VF scalar calls, and then gather the result into the vector return
3276   // value.
3277   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3278   if (VF == 1)
3279     return ScalarCallCost;
3280 
3281   // Compute corresponding vector type for return value and arguments.
3282   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3283   for (Type *ScalarTy : ScalarTys)
3284     Tys.push_back(ToVectorTy(ScalarTy, VF));
3285 
3286   // Compute costs of unpacking argument values for the scalar calls and
3287   // packing the return values to a vector.
3288   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3289 
3290   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3291 
3292   // If we can't emit a vector call for this function, then the currently found
3293   // cost is the cost we need to return.
3294   NeedToScalarize = true;
3295   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3296   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3297 
3298   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3299     return Cost;
3300 
3301   // If the corresponding vector cost is cheaper, return its cost.
3302   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3303   if (VectorCallCost < Cost) {
3304     NeedToScalarize = false;
3305     return VectorCallCost;
3306   }
3307   return Cost;
3308 }
3309 
3310 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3311                                                             unsigned VF) {
3312   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3313   assert(ID && "Expected intrinsic call!");
3314 
3315   FastMathFlags FMF;
3316   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3317     FMF = FPMO->getFastMathFlags();
3318 
3319   SmallVector<Value *, 4> Operands(CI->arg_operands());
3320   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI);
3321 }
3322 
3323 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3324   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3325   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3326   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3327 }
3328 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3329   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3330   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3331   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3332 }
3333 
3334 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3335   // For every instruction `I` in MinBWs, truncate the operands, create a
3336   // truncated version of `I` and reextend its result. InstCombine runs
3337   // later and will remove any ext/trunc pairs.
3338   SmallPtrSet<Value *, 4> Erased;
3339   for (const auto &KV : Cost->getMinimalBitwidths()) {
3340     // If the value wasn't vectorized, we must maintain the original scalar
3341     // type. The absence of the value from VectorLoopValueMap indicates that it
3342     // wasn't vectorized.
3343     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3344       continue;
3345     for (unsigned Part = 0; Part < UF; ++Part) {
3346       Value *I = getOrCreateVectorValue(KV.first, Part);
3347       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3348           !isa<Instruction>(I))
3349         continue;
3350       Type *OriginalTy = I->getType();
3351       Type *ScalarTruncatedTy =
3352           IntegerType::get(OriginalTy->getContext(), KV.second);
3353       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3354                                           OriginalTy->getVectorNumElements());
3355       if (TruncatedTy == OriginalTy)
3356         continue;
3357 
3358       IRBuilder<> B(cast<Instruction>(I));
3359       auto ShrinkOperand = [&](Value *V) -> Value * {
3360         if (auto *ZI = dyn_cast<ZExtInst>(V))
3361           if (ZI->getSrcTy() == TruncatedTy)
3362             return ZI->getOperand(0);
3363         return B.CreateZExtOrTrunc(V, TruncatedTy);
3364       };
3365 
3366       // The actual instruction modification depends on the instruction type,
3367       // unfortunately.
3368       Value *NewI = nullptr;
3369       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3370         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3371                              ShrinkOperand(BO->getOperand(1)));
3372 
3373         // Any wrapping introduced by shrinking this operation shouldn't be
3374         // considered undefined behavior. So, we can't unconditionally copy
3375         // arithmetic wrapping flags to NewI.
3376         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3377       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3378         NewI =
3379             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3380                          ShrinkOperand(CI->getOperand(1)));
3381       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3382         NewI = B.CreateSelect(SI->getCondition(),
3383                               ShrinkOperand(SI->getTrueValue()),
3384                               ShrinkOperand(SI->getFalseValue()));
3385       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3386         switch (CI->getOpcode()) {
3387         default:
3388           llvm_unreachable("Unhandled cast!");
3389         case Instruction::Trunc:
3390           NewI = ShrinkOperand(CI->getOperand(0));
3391           break;
3392         case Instruction::SExt:
3393           NewI = B.CreateSExtOrTrunc(
3394               CI->getOperand(0),
3395               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3396           break;
3397         case Instruction::ZExt:
3398           NewI = B.CreateZExtOrTrunc(
3399               CI->getOperand(0),
3400               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3401           break;
3402         }
3403       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3404         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3405         auto *O0 = B.CreateZExtOrTrunc(
3406             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3407         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3408         auto *O1 = B.CreateZExtOrTrunc(
3409             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3410 
3411         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3412       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3413         // Don't do anything with the operands, just extend the result.
3414         continue;
3415       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3416         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3417         auto *O0 = B.CreateZExtOrTrunc(
3418             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3419         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3420         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3421       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3422         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3423         auto *O0 = B.CreateZExtOrTrunc(
3424             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3425         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3426       } else {
3427         // If we don't know what to do, be conservative and don't do anything.
3428         continue;
3429       }
3430 
3431       // Lastly, extend the result.
3432       NewI->takeName(cast<Instruction>(I));
3433       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3434       I->replaceAllUsesWith(Res);
3435       cast<Instruction>(I)->eraseFromParent();
3436       Erased.insert(I);
3437       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3438     }
3439   }
3440 
3441   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3442   for (const auto &KV : Cost->getMinimalBitwidths()) {
3443     // If the value wasn't vectorized, we must maintain the original scalar
3444     // type. The absence of the value from VectorLoopValueMap indicates that it
3445     // wasn't vectorized.
3446     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3447       continue;
3448     for (unsigned Part = 0; Part < UF; ++Part) {
3449       Value *I = getOrCreateVectorValue(KV.first, Part);
3450       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3451       if (Inst && Inst->use_empty()) {
3452         Value *NewI = Inst->getOperand(0);
3453         Inst->eraseFromParent();
3454         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3455       }
3456     }
3457   }
3458 }
3459 
3460 void InnerLoopVectorizer::fixVectorizedLoop() {
3461   // Insert truncates and extends for any truncated instructions as hints to
3462   // InstCombine.
3463   if (VF > 1)
3464     truncateToMinimalBitwidths();
3465 
3466   // Fix widened non-induction PHIs by setting up the PHI operands.
3467   if (OrigPHIsToFix.size()) {
3468     assert(EnableVPlanNativePath &&
3469            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3470     fixNonInductionPHIs();
3471   }
3472 
3473   // At this point every instruction in the original loop is widened to a
3474   // vector form. Now we need to fix the recurrences in the loop. These PHI
3475   // nodes are currently empty because we did not want to introduce cycles.
3476   // This is the second stage of vectorizing recurrences.
3477   fixCrossIterationPHIs();
3478 
3479   // Forget the original basic block.
3480   PSE.getSE()->forgetLoop(OrigLoop);
3481 
3482   // Fix-up external users of the induction variables.
3483   for (auto &Entry : Legal->getInductionVars())
3484     fixupIVUsers(Entry.first, Entry.second,
3485                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3486                  IVEndValues[Entry.first], LoopMiddleBlock);
3487 
3488   fixLCSSAPHIs();
3489   for (Instruction *PI : PredicatedInstructions)
3490     sinkScalarOperands(&*PI);
3491 
3492   // Remove redundant induction instructions.
3493   cse(LoopVectorBody);
3494 
3495   // Set/update profile weights for the vector and remainder loops as original
3496   // loop iterations are now distributed among them. Note that original loop
3497   // represented by LoopScalarBody becomes remainder loop after vectorization.
3498   //
3499   // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
3500   // end up getting slightly roughened result but that should be OK since
3501   // profile is not inherently precise anyway. Note also possible bypass of
3502   // vector code caused by legality checks is ignored, assigning all the weight
3503   // to the vector loop, optimistically.
3504   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3505                                LI->getLoopFor(LoopVectorBody),
3506                                LI->getLoopFor(LoopScalarBody), VF * UF);
3507 }
3508 
3509 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3510   // In order to support recurrences we need to be able to vectorize Phi nodes.
3511   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3512   // stage #2: We now need to fix the recurrences by adding incoming edges to
3513   // the currently empty PHI nodes. At this point every instruction in the
3514   // original loop is widened to a vector form so we can use them to construct
3515   // the incoming edges.
3516   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3517     // Handle first-order recurrences and reductions that need to be fixed.
3518     if (Legal->isFirstOrderRecurrence(&Phi))
3519       fixFirstOrderRecurrence(&Phi);
3520     else if (Legal->isReductionVariable(&Phi))
3521       fixReduction(&Phi);
3522   }
3523 }
3524 
3525 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3526   // This is the second phase of vectorizing first-order recurrences. An
3527   // overview of the transformation is described below. Suppose we have the
3528   // following loop.
3529   //
3530   //   for (int i = 0; i < n; ++i)
3531   //     b[i] = a[i] - a[i - 1];
3532   //
3533   // There is a first-order recurrence on "a". For this loop, the shorthand
3534   // scalar IR looks like:
3535   //
3536   //   scalar.ph:
3537   //     s_init = a[-1]
3538   //     br scalar.body
3539   //
3540   //   scalar.body:
3541   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3542   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3543   //     s2 = a[i]
3544   //     b[i] = s2 - s1
3545   //     br cond, scalar.body, ...
3546   //
3547   // In this example, s1 is a recurrence because it's value depends on the
3548   // previous iteration. In the first phase of vectorization, we created a
3549   // temporary value for s1. We now complete the vectorization and produce the
3550   // shorthand vector IR shown below (for VF = 4, UF = 1).
3551   //
3552   //   vector.ph:
3553   //     v_init = vector(..., ..., ..., a[-1])
3554   //     br vector.body
3555   //
3556   //   vector.body
3557   //     i = phi [0, vector.ph], [i+4, vector.body]
3558   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3559   //     v2 = a[i, i+1, i+2, i+3];
3560   //     v3 = vector(v1(3), v2(0, 1, 2))
3561   //     b[i, i+1, i+2, i+3] = v2 - v3
3562   //     br cond, vector.body, middle.block
3563   //
3564   //   middle.block:
3565   //     x = v2(3)
3566   //     br scalar.ph
3567   //
3568   //   scalar.ph:
3569   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3570   //     br scalar.body
3571   //
3572   // After execution completes the vector loop, we extract the next value of
3573   // the recurrence (x) to use as the initial value in the scalar loop.
3574 
3575   // Get the original loop preheader and single loop latch.
3576   auto *Preheader = OrigLoop->getLoopPreheader();
3577   auto *Latch = OrigLoop->getLoopLatch();
3578 
3579   // Get the initial and previous values of the scalar recurrence.
3580   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3581   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3582 
3583   // Create a vector from the initial value.
3584   auto *VectorInit = ScalarInit;
3585   if (VF > 1) {
3586     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3587     VectorInit = Builder.CreateInsertElement(
3588         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3589         Builder.getInt32(VF - 1), "vector.recur.init");
3590   }
3591 
3592   // We constructed a temporary phi node in the first phase of vectorization.
3593   // This phi node will eventually be deleted.
3594   Builder.SetInsertPoint(
3595       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3596 
3597   // Create a phi node for the new recurrence. The current value will either be
3598   // the initial value inserted into a vector or loop-varying vector value.
3599   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3600   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3601 
3602   // Get the vectorized previous value of the last part UF - 1. It appears last
3603   // among all unrolled iterations, due to the order of their construction.
3604   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3605 
3606   // Find and set the insertion point after the previous value if it is an
3607   // instruction.
3608   BasicBlock::iterator InsertPt;
3609   // Note that the previous value may have been constant-folded so it is not
3610   // guaranteed to be an instruction in the vector loop.
3611   // FIXME: Loop invariant values do not form recurrences. We should deal with
3612   //        them earlier.
3613   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3614     InsertPt = LoopVectorBody->getFirstInsertionPt();
3615   else {
3616     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3617     if (isa<PHINode>(PreviousLastPart))
3618       // If the previous value is a phi node, we should insert after all the phi
3619       // nodes in the block containing the PHI to avoid breaking basic block
3620       // verification. Note that the basic block may be different to
3621       // LoopVectorBody, in case we predicate the loop.
3622       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3623     else
3624       InsertPt = ++PreviousInst->getIterator();
3625   }
3626   Builder.SetInsertPoint(&*InsertPt);
3627 
3628   // We will construct a vector for the recurrence by combining the values for
3629   // the current and previous iterations. This is the required shuffle mask.
3630   SmallVector<Constant *, 8> ShuffleMask(VF);
3631   ShuffleMask[0] = Builder.getInt32(VF - 1);
3632   for (unsigned I = 1; I < VF; ++I)
3633     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3634 
3635   // The vector from which to take the initial value for the current iteration
3636   // (actual or unrolled). Initially, this is the vector phi node.
3637   Value *Incoming = VecPhi;
3638 
3639   // Shuffle the current and previous vector and update the vector parts.
3640   for (unsigned Part = 0; Part < UF; ++Part) {
3641     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3642     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3643     auto *Shuffle =
3644         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3645                                              ConstantVector::get(ShuffleMask))
3646                : Incoming;
3647     PhiPart->replaceAllUsesWith(Shuffle);
3648     cast<Instruction>(PhiPart)->eraseFromParent();
3649     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3650     Incoming = PreviousPart;
3651   }
3652 
3653   // Fix the latch value of the new recurrence in the vector loop.
3654   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3655 
3656   // Extract the last vector element in the middle block. This will be the
3657   // initial value for the recurrence when jumping to the scalar loop.
3658   auto *ExtractForScalar = Incoming;
3659   if (VF > 1) {
3660     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3661     ExtractForScalar = Builder.CreateExtractElement(
3662         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3663   }
3664   // Extract the second last element in the middle block if the
3665   // Phi is used outside the loop. We need to extract the phi itself
3666   // and not the last element (the phi update in the current iteration). This
3667   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3668   // when the scalar loop is not run at all.
3669   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3670   if (VF > 1)
3671     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3672         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3673   // When loop is unrolled without vectorizing, initialize
3674   // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
3675   // `Incoming`. This is analogous to the vectorized case above: extracting the
3676   // second last element when VF > 1.
3677   else if (UF > 1)
3678     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3679 
3680   // Fix the initial value of the original recurrence in the scalar loop.
3681   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3682   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3683   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3684     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3685     Start->addIncoming(Incoming, BB);
3686   }
3687 
3688   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3689   Phi->setName("scalar.recur");
3690 
3691   // Finally, fix users of the recurrence outside the loop. The users will need
3692   // either the last value of the scalar recurrence or the last value of the
3693   // vector recurrence we extracted in the middle block. Since the loop is in
3694   // LCSSA form, we just need to find all the phi nodes for the original scalar
3695   // recurrence in the exit block, and then add an edge for the middle block.
3696   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3697     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3698       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3699     }
3700   }
3701 }
3702 
3703 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3704   Constant *Zero = Builder.getInt32(0);
3705 
3706   // Get it's reduction variable descriptor.
3707   assert(Legal->isReductionVariable(Phi) &&
3708          "Unable to find the reduction variable");
3709   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3710 
3711   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3712   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3713   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3714   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3715     RdxDesc.getMinMaxRecurrenceKind();
3716   setDebugLocFromInst(Builder, ReductionStartValue);
3717 
3718   // We need to generate a reduction vector from the incoming scalar.
3719   // To do so, we need to generate the 'identity' vector and override
3720   // one of the elements with the incoming scalar reduction. We need
3721   // to do it in the vector-loop preheader.
3722   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3723 
3724   // This is the vector-clone of the value that leaves the loop.
3725   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3726 
3727   // Find the reduction identity variable. Zero for addition, or, xor,
3728   // one for multiplication, -1 for And.
3729   Value *Identity;
3730   Value *VectorStart;
3731   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3732       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3733     // MinMax reduction have the start value as their identify.
3734     if (VF == 1) {
3735       VectorStart = Identity = ReductionStartValue;
3736     } else {
3737       VectorStart = Identity =
3738         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3739     }
3740   } else {
3741     // Handle other reduction kinds:
3742     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3743         RK, VecTy->getScalarType());
3744     if (VF == 1) {
3745       Identity = Iden;
3746       // This vector is the Identity vector where the first element is the
3747       // incoming scalar reduction.
3748       VectorStart = ReductionStartValue;
3749     } else {
3750       Identity = ConstantVector::getSplat({VF, false}, Iden);
3751 
3752       // This vector is the Identity vector where the first element is the
3753       // incoming scalar reduction.
3754       VectorStart =
3755         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3756     }
3757   }
3758 
3759   // Wrap flags are in general invalid after vectorization, clear them.
3760   clearReductionWrapFlags(RdxDesc);
3761 
3762   // Fix the vector-loop phi.
3763 
3764   // Reductions do not have to start at zero. They can start with
3765   // any loop invariant values.
3766   BasicBlock *Latch = OrigLoop->getLoopLatch();
3767   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3768 
3769   for (unsigned Part = 0; Part < UF; ++Part) {
3770     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3771     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3772     // Make sure to add the reduction start value only to the
3773     // first unroll part.
3774     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3775     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3776     cast<PHINode>(VecRdxPhi)
3777       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3778   }
3779 
3780   // Before each round, move the insertion point right between
3781   // the PHIs and the values we are going to write.
3782   // This allows us to write both PHINodes and the extractelement
3783   // instructions.
3784   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3785 
3786   setDebugLocFromInst(Builder, LoopExitInst);
3787 
3788   // If tail is folded by masking, the vector value to leave the loop should be
3789   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3790   // instead of the former.
3791   if (Cost->foldTailByMasking()) {
3792     for (unsigned Part = 0; Part < UF; ++Part) {
3793       Value *VecLoopExitInst =
3794           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3795       Value *Sel = nullptr;
3796       for (User *U : VecLoopExitInst->users()) {
3797         if (isa<SelectInst>(U)) {
3798           assert(!Sel && "Reduction exit feeding two selects");
3799           Sel = U;
3800         } else
3801           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3802       }
3803       assert(Sel && "Reduction exit feeds no select");
3804       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3805     }
3806   }
3807 
3808   // If the vector reduction can be performed in a smaller type, we truncate
3809   // then extend the loop exit value to enable InstCombine to evaluate the
3810   // entire expression in the smaller type.
3811   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3812     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3813     Builder.SetInsertPoint(
3814         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3815     VectorParts RdxParts(UF);
3816     for (unsigned Part = 0; Part < UF; ++Part) {
3817       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3818       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3819       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3820                                         : Builder.CreateZExt(Trunc, VecTy);
3821       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3822            UI != RdxParts[Part]->user_end();)
3823         if (*UI != Trunc) {
3824           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3825           RdxParts[Part] = Extnd;
3826         } else {
3827           ++UI;
3828         }
3829     }
3830     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3831     for (unsigned Part = 0; Part < UF; ++Part) {
3832       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3833       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3834     }
3835   }
3836 
3837   // Reduce all of the unrolled parts into a single vector.
3838   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3839   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3840 
3841   // The middle block terminator has already been assigned a DebugLoc here (the
3842   // OrigLoop's single latch terminator). We want the whole middle block to
3843   // appear to execute on this line because: (a) it is all compiler generated,
3844   // (b) these instructions are always executed after evaluating the latch
3845   // conditional branch, and (c) other passes may add new predecessors which
3846   // terminate on this line. This is the easiest way to ensure we don't
3847   // accidentally cause an extra step back into the loop while debugging.
3848   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3849   for (unsigned Part = 1; Part < UF; ++Part) {
3850     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3851     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3852       // Floating point operations had to be 'fast' to enable the reduction.
3853       ReducedPartRdx = addFastMathFlag(
3854           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3855                               ReducedPartRdx, "bin.rdx"),
3856           RdxDesc.getFastMathFlags());
3857     else
3858       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3859                                       RdxPart);
3860   }
3861 
3862   if (VF > 1) {
3863     bool NoNaN = Legal->hasFunNoNaNAttr();
3864     ReducedPartRdx =
3865         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3866     // If the reduction can be performed in a smaller type, we need to extend
3867     // the reduction to the wider type before we branch to the original loop.
3868     if (Phi->getType() != RdxDesc.getRecurrenceType())
3869       ReducedPartRdx =
3870         RdxDesc.isSigned()
3871         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3872         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3873   }
3874 
3875   // Create a phi node that merges control-flow from the backedge-taken check
3876   // block and the middle block.
3877   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3878                                         LoopScalarPreHeader->getTerminator());
3879   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3880     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3881   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3882 
3883   // Now, we need to fix the users of the reduction variable
3884   // inside and outside of the scalar remainder loop.
3885   // We know that the loop is in LCSSA form. We need to update the
3886   // PHI nodes in the exit blocks.
3887   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3888     // All PHINodes need to have a single entry edge, or two if
3889     // we already fixed them.
3890     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3891 
3892     // We found a reduction value exit-PHI. Update it with the
3893     // incoming bypass edge.
3894     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3895       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3896   } // end of the LCSSA phi scan.
3897 
3898     // Fix the scalar loop reduction variable with the incoming reduction sum
3899     // from the vector body and from the backedge value.
3900   int IncomingEdgeBlockIdx =
3901     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3902   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3903   // Pick the other block.
3904   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3905   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3906   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3907 }
3908 
3909 void InnerLoopVectorizer::clearReductionWrapFlags(
3910     RecurrenceDescriptor &RdxDesc) {
3911   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3912   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3913       RK != RecurrenceDescriptor::RK_IntegerMult)
3914     return;
3915 
3916   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3917   assert(LoopExitInstr && "null loop exit instruction");
3918   SmallVector<Instruction *, 8> Worklist;
3919   SmallPtrSet<Instruction *, 8> Visited;
3920   Worklist.push_back(LoopExitInstr);
3921   Visited.insert(LoopExitInstr);
3922 
3923   while (!Worklist.empty()) {
3924     Instruction *Cur = Worklist.pop_back_val();
3925     if (isa<OverflowingBinaryOperator>(Cur))
3926       for (unsigned Part = 0; Part < UF; ++Part) {
3927         Value *V = getOrCreateVectorValue(Cur, Part);
3928         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3929       }
3930 
3931     for (User *U : Cur->users()) {
3932       Instruction *UI = cast<Instruction>(U);
3933       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3934           Visited.insert(UI).second)
3935         Worklist.push_back(UI);
3936     }
3937   }
3938 }
3939 
3940 void InnerLoopVectorizer::fixLCSSAPHIs() {
3941   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3942     if (LCSSAPhi.getNumIncomingValues() == 1) {
3943       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3944       // Non-instruction incoming values will have only one value.
3945       unsigned LastLane = 0;
3946       if (isa<Instruction>(IncomingValue))
3947           LastLane = Cost->isUniformAfterVectorization(
3948                          cast<Instruction>(IncomingValue), VF)
3949                          ? 0
3950                          : VF - 1;
3951       // Can be a loop invariant incoming value or the last scalar value to be
3952       // extracted from the vectorized loop.
3953       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3954       Value *lastIncomingValue =
3955           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3956       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3957     }
3958   }
3959 }
3960 
3961 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3962   // The basic block and loop containing the predicated instruction.
3963   auto *PredBB = PredInst->getParent();
3964   auto *VectorLoop = LI->getLoopFor(PredBB);
3965 
3966   // Initialize a worklist with the operands of the predicated instruction.
3967   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3968 
3969   // Holds instructions that we need to analyze again. An instruction may be
3970   // reanalyzed if we don't yet know if we can sink it or not.
3971   SmallVector<Instruction *, 8> InstsToReanalyze;
3972 
3973   // Returns true if a given use occurs in the predicated block. Phi nodes use
3974   // their operands in their corresponding predecessor blocks.
3975   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3976     auto *I = cast<Instruction>(U.getUser());
3977     BasicBlock *BB = I->getParent();
3978     if (auto *Phi = dyn_cast<PHINode>(I))
3979       BB = Phi->getIncomingBlock(
3980           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3981     return BB == PredBB;
3982   };
3983 
3984   // Iteratively sink the scalarized operands of the predicated instruction
3985   // into the block we created for it. When an instruction is sunk, it's
3986   // operands are then added to the worklist. The algorithm ends after one pass
3987   // through the worklist doesn't sink a single instruction.
3988   bool Changed;
3989   do {
3990     // Add the instructions that need to be reanalyzed to the worklist, and
3991     // reset the changed indicator.
3992     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3993     InstsToReanalyze.clear();
3994     Changed = false;
3995 
3996     while (!Worklist.empty()) {
3997       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3998 
3999       // We can't sink an instruction if it is a phi node, is already in the
4000       // predicated block, is not in the loop, or may have side effects.
4001       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4002           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4003         continue;
4004 
4005       // It's legal to sink the instruction if all its uses occur in the
4006       // predicated block. Otherwise, there's nothing to do yet, and we may
4007       // need to reanalyze the instruction.
4008       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4009         InstsToReanalyze.push_back(I);
4010         continue;
4011       }
4012 
4013       // Move the instruction to the beginning of the predicated block, and add
4014       // it's operands to the worklist.
4015       I->moveBefore(&*PredBB->getFirstInsertionPt());
4016       Worklist.insert(I->op_begin(), I->op_end());
4017 
4018       // The sinking may have enabled other instructions to be sunk, so we will
4019       // need to iterate.
4020       Changed = true;
4021     }
4022   } while (Changed);
4023 }
4024 
4025 void InnerLoopVectorizer::fixNonInductionPHIs() {
4026   for (PHINode *OrigPhi : OrigPHIsToFix) {
4027     PHINode *NewPhi =
4028         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4029     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4030 
4031     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4032         predecessors(OrigPhi->getParent()));
4033     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4034         predecessors(NewPhi->getParent()));
4035     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4036            "Scalar and Vector BB should have the same number of predecessors");
4037 
4038     // The insertion point in Builder may be invalidated by the time we get
4039     // here. Force the Builder insertion point to something valid so that we do
4040     // not run into issues during insertion point restore in
4041     // getOrCreateVectorValue calls below.
4042     Builder.SetInsertPoint(NewPhi);
4043 
4044     // The predecessor order is preserved and we can rely on mapping between
4045     // scalar and vector block predecessors.
4046     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4047       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4048 
4049       // When looking up the new scalar/vector values to fix up, use incoming
4050       // values from original phi.
4051       Value *ScIncV =
4052           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4053 
4054       // Scalar incoming value may need a broadcast
4055       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4056       NewPhi->addIncoming(NewIncV, NewPredBB);
4057     }
4058   }
4059 }
4060 
4061 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4062                                    unsigned VF, bool IsPtrLoopInvariant,
4063                                    SmallBitVector &IsIndexLoopInvariant) {
4064   // Construct a vector GEP by widening the operands of the scalar GEP as
4065   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4066   // results in a vector of pointers when at least one operand of the GEP
4067   // is vector-typed. Thus, to keep the representation compact, we only use
4068   // vector-typed operands for loop-varying values.
4069 
4070   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4071     // If we are vectorizing, but the GEP has only loop-invariant operands,
4072     // the GEP we build (by only using vector-typed operands for
4073     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4074     // produce a vector of pointers, we need to either arbitrarily pick an
4075     // operand to broadcast, or broadcast a clone of the original GEP.
4076     // Here, we broadcast a clone of the original.
4077     //
4078     // TODO: If at some point we decide to scalarize instructions having
4079     //       loop-invariant operands, this special case will no longer be
4080     //       required. We would add the scalarization decision to
4081     //       collectLoopScalars() and teach getVectorValue() to broadcast
4082     //       the lane-zero scalar value.
4083     auto *Clone = Builder.Insert(GEP->clone());
4084     for (unsigned Part = 0; Part < UF; ++Part) {
4085       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4086       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4087       addMetadata(EntryPart, GEP);
4088     }
4089   } else {
4090     // If the GEP has at least one loop-varying operand, we are sure to
4091     // produce a vector of pointers. But if we are only unrolling, we want
4092     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4093     // produce with the code below will be scalar (if VF == 1) or vector
4094     // (otherwise). Note that for the unroll-only case, we still maintain
4095     // values in the vector mapping with initVector, as we do for other
4096     // instructions.
4097     for (unsigned Part = 0; Part < UF; ++Part) {
4098       // The pointer operand of the new GEP. If it's loop-invariant, we
4099       // won't broadcast it.
4100       auto *Ptr = IsPtrLoopInvariant
4101                       ? GEP->getPointerOperand()
4102                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4103 
4104       // Collect all the indices for the new GEP. If any index is
4105       // loop-invariant, we won't broadcast it.
4106       SmallVector<Value *, 4> Indices;
4107       for (auto Index : enumerate(GEP->indices())) {
4108         Value *User = Index.value().get();
4109         if (IsIndexLoopInvariant[Index.index()])
4110           Indices.push_back(User);
4111         else
4112           Indices.push_back(getOrCreateVectorValue(User, Part));
4113       }
4114 
4115       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4116       // but it should be a vector, otherwise.
4117       auto *NewGEP =
4118           GEP->isInBounds()
4119               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4120                                           Indices)
4121               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4122       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4123              "NewGEP is not a pointer vector");
4124       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4125       addMetadata(NewGEP, GEP);
4126     }
4127   }
4128 }
4129 
4130 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4131                                               unsigned VF) {
4132   PHINode *P = cast<PHINode>(PN);
4133   if (EnableVPlanNativePath) {
4134     // Currently we enter here in the VPlan-native path for non-induction
4135     // PHIs where all control flow is uniform. We simply widen these PHIs.
4136     // Create a vector phi with no operands - the vector phi operands will be
4137     // set at the end of vector code generation.
4138     Type *VecTy =
4139         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4140     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4141     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4142     OrigPHIsToFix.push_back(P);
4143 
4144     return;
4145   }
4146 
4147   assert(PN->getParent() == OrigLoop->getHeader() &&
4148          "Non-header phis should have been handled elsewhere");
4149 
4150   // In order to support recurrences we need to be able to vectorize Phi nodes.
4151   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4152   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4153   // this value when we vectorize all of the instructions that use the PHI.
4154   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4155     for (unsigned Part = 0; Part < UF; ++Part) {
4156       // This is phase one of vectorizing PHIs.
4157       Type *VecTy =
4158           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4159       Value *EntryPart = PHINode::Create(
4160           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4161       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4162     }
4163     return;
4164   }
4165 
4166   setDebugLocFromInst(Builder, P);
4167 
4168   // This PHINode must be an induction variable.
4169   // Make sure that we know about it.
4170   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4171 
4172   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4173   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4174 
4175   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4176   // which can be found from the original scalar operations.
4177   switch (II.getKind()) {
4178   case InductionDescriptor::IK_NoInduction:
4179     llvm_unreachable("Unknown induction");
4180   case InductionDescriptor::IK_IntInduction:
4181   case InductionDescriptor::IK_FpInduction:
4182     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4183   case InductionDescriptor::IK_PtrInduction: {
4184     // Handle the pointer induction variable case.
4185     assert(P->getType()->isPointerTy() && "Unexpected type.");
4186     // This is the normalized GEP that starts counting at zero.
4187     Value *PtrInd = Induction;
4188     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4189     // Determine the number of scalars we need to generate for each unroll
4190     // iteration. If the instruction is uniform, we only need to generate the
4191     // first lane. Otherwise, we generate all VF values.
4192     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4193     // These are the scalar results. Notice that we don't generate vector GEPs
4194     // because scalar GEPs result in better code.
4195     for (unsigned Part = 0; Part < UF; ++Part) {
4196       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4197         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4198         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4199         Value *SclrGep =
4200             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4201         SclrGep->setName("next.gep");
4202         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4203       }
4204     }
4205     return;
4206   }
4207   }
4208 }
4209 
4210 /// A helper function for checking whether an integer division-related
4211 /// instruction may divide by zero (in which case it must be predicated if
4212 /// executed conditionally in the scalar code).
4213 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4214 /// Non-zero divisors that are non compile-time constants will not be
4215 /// converted into multiplication, so we will still end up scalarizing
4216 /// the division, but can do so w/o predication.
4217 static bool mayDivideByZero(Instruction &I) {
4218   assert((I.getOpcode() == Instruction::UDiv ||
4219           I.getOpcode() == Instruction::SDiv ||
4220           I.getOpcode() == Instruction::URem ||
4221           I.getOpcode() == Instruction::SRem) &&
4222          "Unexpected instruction");
4223   Value *Divisor = I.getOperand(1);
4224   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4225   return !CInt || CInt->isZero();
4226 }
4227 
4228 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4229   switch (I.getOpcode()) {
4230   case Instruction::Call:
4231   case Instruction::Br:
4232   case Instruction::PHI:
4233   case Instruction::GetElementPtr:
4234     llvm_unreachable("This instruction is handled by a different recipe.");
4235   case Instruction::UDiv:
4236   case Instruction::SDiv:
4237   case Instruction::SRem:
4238   case Instruction::URem:
4239   case Instruction::Add:
4240   case Instruction::FAdd:
4241   case Instruction::Sub:
4242   case Instruction::FSub:
4243   case Instruction::FNeg:
4244   case Instruction::Mul:
4245   case Instruction::FMul:
4246   case Instruction::FDiv:
4247   case Instruction::FRem:
4248   case Instruction::Shl:
4249   case Instruction::LShr:
4250   case Instruction::AShr:
4251   case Instruction::And:
4252   case Instruction::Or:
4253   case Instruction::Xor: {
4254     // Just widen unops and binops.
4255     setDebugLocFromInst(Builder, &I);
4256 
4257     for (unsigned Part = 0; Part < UF; ++Part) {
4258       SmallVector<Value *, 2> Ops;
4259       for (Value *Op : I.operands())
4260         Ops.push_back(getOrCreateVectorValue(Op, Part));
4261 
4262       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4263 
4264       if (auto *VecOp = dyn_cast<Instruction>(V))
4265         VecOp->copyIRFlags(&I);
4266 
4267       // Use this vector value for all users of the original instruction.
4268       VectorLoopValueMap.setVectorValue(&I, Part, V);
4269       addMetadata(V, &I);
4270     }
4271 
4272     break;
4273   }
4274   case Instruction::Select: {
4275     // Widen selects.
4276     // If the selector is loop invariant we can create a select
4277     // instruction with a scalar condition. Otherwise, use vector-select.
4278     auto *SE = PSE.getSE();
4279     bool InvariantCond =
4280         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4281     setDebugLocFromInst(Builder, &I);
4282 
4283     // The condition can be loop invariant  but still defined inside the
4284     // loop. This means that we can't just use the original 'cond' value.
4285     // We have to take the 'vectorized' value and pick the first lane.
4286     // Instcombine will make this a no-op.
4287 
4288     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4289 
4290     for (unsigned Part = 0; Part < UF; ++Part) {
4291       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4292       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4293       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4294       Value *Sel =
4295           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4296       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4297       addMetadata(Sel, &I);
4298     }
4299 
4300     break;
4301   }
4302 
4303   case Instruction::ICmp:
4304   case Instruction::FCmp: {
4305     // Widen compares. Generate vector compares.
4306     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4307     auto *Cmp = cast<CmpInst>(&I);
4308     setDebugLocFromInst(Builder, Cmp);
4309     for (unsigned Part = 0; Part < UF; ++Part) {
4310       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4311       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4312       Value *C = nullptr;
4313       if (FCmp) {
4314         // Propagate fast math flags.
4315         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4316         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4317         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4318       } else {
4319         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4320       }
4321       VectorLoopValueMap.setVectorValue(&I, Part, C);
4322       addMetadata(C, &I);
4323     }
4324 
4325     break;
4326   }
4327 
4328   case Instruction::ZExt:
4329   case Instruction::SExt:
4330   case Instruction::FPToUI:
4331   case Instruction::FPToSI:
4332   case Instruction::FPExt:
4333   case Instruction::PtrToInt:
4334   case Instruction::IntToPtr:
4335   case Instruction::SIToFP:
4336   case Instruction::UIToFP:
4337   case Instruction::Trunc:
4338   case Instruction::FPTrunc:
4339   case Instruction::BitCast: {
4340     auto *CI = cast<CastInst>(&I);
4341     setDebugLocFromInst(Builder, CI);
4342 
4343     /// Vectorize casts.
4344     Type *DestTy =
4345         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4346 
4347     for (unsigned Part = 0; Part < UF; ++Part) {
4348       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4349       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4350       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4351       addMetadata(Cast, &I);
4352     }
4353     break;
4354   }
4355   default:
4356     // This instruction is not vectorized by simple widening.
4357     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4358     llvm_unreachable("Unhandled instruction!");
4359   } // end of switch.
4360 }
4361 
4362 void InnerLoopVectorizer::widenCallInstruction(CallInst &I) {
4363   // Ignore dbg intrinsics.
4364   // TODO: Debug intrinsics should be skipped/handled during VPlan construction
4365   // rather than dropping them here.
4366   if (isa<DbgInfoIntrinsic>(I))
4367     return;
4368   setDebugLocFromInst(Builder, &I);
4369 
4370   Module *M = I.getParent()->getParent()->getParent();
4371   auto *CI = cast<CallInst>(&I);
4372 
4373   SmallVector<Type *, 4> Tys;
4374   for (Value *ArgOperand : CI->arg_operands())
4375     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4376 
4377   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4378 
4379   // The flag shows whether we use Intrinsic or a usual Call for vectorized
4380   // version of the instruction.
4381   // Is it beneficial to perform intrinsic call compared to lib call?
4382   bool NeedToScalarize = false;
4383   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4384   bool UseVectorIntrinsic =
4385       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4386   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4387          "Instruction should be scalarized elsewhere.");
4388 
4389   for (unsigned Part = 0; Part < UF; ++Part) {
4390     SmallVector<Value *, 4> Args;
4391     for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4392       Value *Arg = CI->getArgOperand(i);
4393       // Some intrinsics have a scalar argument - don't replace it with a
4394       // vector.
4395       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4396         Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4397       Args.push_back(Arg);
4398     }
4399 
4400     Function *VectorF;
4401     if (UseVectorIntrinsic) {
4402       // Use vector version of the intrinsic.
4403       Type *TysForDecl[] = {CI->getType()};
4404       if (VF > 1)
4405         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4406       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4407     } else {
4408       // Use vector version of the function call.
4409       const VFShape Shape =
4410           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4411 #ifndef NDEBUG
4412         const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
4413         assert(std::find_if(Infos.begin(), Infos.end(),
4414                             [&Shape](const VFInfo &Info) {
4415                               return Info.Shape == Shape;
4416                             }) != Infos.end() &&
4417                "Vector function shape is missing from the database.");
4418 #endif
4419         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4420     }
4421       assert(VectorF && "Can't create vector function.");
4422 
4423       SmallVector<OperandBundleDef, 1> OpBundles;
4424       CI->getOperandBundlesAsDefs(OpBundles);
4425       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4426 
4427       if (isa<FPMathOperator>(V))
4428         V->copyFastMathFlags(CI);
4429 
4430       VectorLoopValueMap.setVectorValue(&I, Part, V);
4431       addMetadata(V, &I);
4432   }
4433 }
4434 
4435 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4436   // We should not collect Scalars more than once per VF. Right now, this
4437   // function is called from collectUniformsAndScalars(), which already does
4438   // this check. Collecting Scalars for VF=1 does not make any sense.
4439   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4440          "This function should not be visited twice for the same VF");
4441 
4442   SmallSetVector<Instruction *, 8> Worklist;
4443 
4444   // These sets are used to seed the analysis with pointers used by memory
4445   // accesses that will remain scalar.
4446   SmallSetVector<Instruction *, 8> ScalarPtrs;
4447   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4448 
4449   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4450   // The pointer operands of loads and stores will be scalar as long as the
4451   // memory access is not a gather or scatter operation. The value operand of a
4452   // store will remain scalar if the store is scalarized.
4453   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4454     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4455     assert(WideningDecision != CM_Unknown &&
4456            "Widening decision should be ready at this moment");
4457     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4458       if (Ptr == Store->getValueOperand())
4459         return WideningDecision == CM_Scalarize;
4460     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4461            "Ptr is neither a value or pointer operand");
4462     return WideningDecision != CM_GatherScatter;
4463   };
4464 
4465   // A helper that returns true if the given value is a bitcast or
4466   // getelementptr instruction contained in the loop.
4467   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4468     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4469             isa<GetElementPtrInst>(V)) &&
4470            !TheLoop->isLoopInvariant(V);
4471   };
4472 
4473   // A helper that evaluates a memory access's use of a pointer. If the use
4474   // will be a scalar use, and the pointer is only used by memory accesses, we
4475   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4476   // PossibleNonScalarPtrs.
4477   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4478     // We only care about bitcast and getelementptr instructions contained in
4479     // the loop.
4480     if (!isLoopVaryingBitCastOrGEP(Ptr))
4481       return;
4482 
4483     // If the pointer has already been identified as scalar (e.g., if it was
4484     // also identified as uniform), there's nothing to do.
4485     auto *I = cast<Instruction>(Ptr);
4486     if (Worklist.count(I))
4487       return;
4488 
4489     // If the use of the pointer will be a scalar use, and all users of the
4490     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4491     // place the pointer in PossibleNonScalarPtrs.
4492     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4493           return isa<LoadInst>(U) || isa<StoreInst>(U);
4494         }))
4495       ScalarPtrs.insert(I);
4496     else
4497       PossibleNonScalarPtrs.insert(I);
4498   };
4499 
4500   // We seed the scalars analysis with three classes of instructions: (1)
4501   // instructions marked uniform-after-vectorization, (2) bitcast and
4502   // getelementptr instructions used by memory accesses requiring a scalar use,
4503   // and (3) pointer induction variables and their update instructions (we
4504   // currently only scalarize these).
4505   //
4506   // (1) Add to the worklist all instructions that have been identified as
4507   // uniform-after-vectorization.
4508   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4509 
4510   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4511   // memory accesses requiring a scalar use. The pointer operands of loads and
4512   // stores will be scalar as long as the memory accesses is not a gather or
4513   // scatter operation. The value operand of a store will remain scalar if the
4514   // store is scalarized.
4515   for (auto *BB : TheLoop->blocks())
4516     for (auto &I : *BB) {
4517       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4518         evaluatePtrUse(Load, Load->getPointerOperand());
4519       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4520         evaluatePtrUse(Store, Store->getPointerOperand());
4521         evaluatePtrUse(Store, Store->getValueOperand());
4522       }
4523     }
4524   for (auto *I : ScalarPtrs)
4525     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4526       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4527       Worklist.insert(I);
4528     }
4529 
4530   // (3) Add to the worklist all pointer induction variables and their update
4531   // instructions.
4532   //
4533   // TODO: Once we are able to vectorize pointer induction variables we should
4534   //       no longer insert them into the worklist here.
4535   auto *Latch = TheLoop->getLoopLatch();
4536   for (auto &Induction : Legal->getInductionVars()) {
4537     auto *Ind = Induction.first;
4538     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4539     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4540       continue;
4541     Worklist.insert(Ind);
4542     Worklist.insert(IndUpdate);
4543     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4544     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4545                       << "\n");
4546   }
4547 
4548   // Insert the forced scalars.
4549   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4550   // induction variable when the PHI user is scalarized.
4551   auto ForcedScalar = ForcedScalars.find(VF);
4552   if (ForcedScalar != ForcedScalars.end())
4553     for (auto *I : ForcedScalar->second)
4554       Worklist.insert(I);
4555 
4556   // Expand the worklist by looking through any bitcasts and getelementptr
4557   // instructions we've already identified as scalar. This is similar to the
4558   // expansion step in collectLoopUniforms(); however, here we're only
4559   // expanding to include additional bitcasts and getelementptr instructions.
4560   unsigned Idx = 0;
4561   while (Idx != Worklist.size()) {
4562     Instruction *Dst = Worklist[Idx++];
4563     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4564       continue;
4565     auto *Src = cast<Instruction>(Dst->getOperand(0));
4566     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4567           auto *J = cast<Instruction>(U);
4568           return !TheLoop->contains(J) || Worklist.count(J) ||
4569                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4570                   isScalarUse(J, Src));
4571         })) {
4572       Worklist.insert(Src);
4573       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4574     }
4575   }
4576 
4577   // An induction variable will remain scalar if all users of the induction
4578   // variable and induction variable update remain scalar.
4579   for (auto &Induction : Legal->getInductionVars()) {
4580     auto *Ind = Induction.first;
4581     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4582 
4583     // We already considered pointer induction variables, so there's no reason
4584     // to look at their users again.
4585     //
4586     // TODO: Once we are able to vectorize pointer induction variables we
4587     //       should no longer skip over them here.
4588     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4589       continue;
4590 
4591     // Determine if all users of the induction variable are scalar after
4592     // vectorization.
4593     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4594       auto *I = cast<Instruction>(U);
4595       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4596     });
4597     if (!ScalarInd)
4598       continue;
4599 
4600     // Determine if all users of the induction variable update instruction are
4601     // scalar after vectorization.
4602     auto ScalarIndUpdate =
4603         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4604           auto *I = cast<Instruction>(U);
4605           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4606         });
4607     if (!ScalarIndUpdate)
4608       continue;
4609 
4610     // The induction variable and its update instruction will remain scalar.
4611     Worklist.insert(Ind);
4612     Worklist.insert(IndUpdate);
4613     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4614     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4615                       << "\n");
4616   }
4617 
4618   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4619 }
4620 
4621 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4622   if (!blockNeedsPredication(I->getParent()))
4623     return false;
4624   switch(I->getOpcode()) {
4625   default:
4626     break;
4627   case Instruction::Load:
4628   case Instruction::Store: {
4629     if (!Legal->isMaskRequired(I))
4630       return false;
4631     auto *Ptr = getLoadStorePointerOperand(I);
4632     auto *Ty = getMemInstValueType(I);
4633     // We have already decided how to vectorize this instruction, get that
4634     // result.
4635     if (VF > 1) {
4636       InstWidening WideningDecision = getWideningDecision(I, VF);
4637       assert(WideningDecision != CM_Unknown &&
4638              "Widening decision should be ready at this moment");
4639       return WideningDecision == CM_Scalarize;
4640     }
4641     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4642     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4643                                 isLegalMaskedGather(Ty, Alignment))
4644                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4645                                 isLegalMaskedScatter(Ty, Alignment));
4646   }
4647   case Instruction::UDiv:
4648   case Instruction::SDiv:
4649   case Instruction::SRem:
4650   case Instruction::URem:
4651     return mayDivideByZero(*I);
4652   }
4653   return false;
4654 }
4655 
4656 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4657                                                                unsigned VF) {
4658   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4659   assert(getWideningDecision(I, VF) == CM_Unknown &&
4660          "Decision should not be set yet.");
4661   auto *Group = getInterleavedAccessGroup(I);
4662   assert(Group && "Must have a group.");
4663 
4664   // If the instruction's allocated size doesn't equal it's type size, it
4665   // requires padding and will be scalarized.
4666   auto &DL = I->getModule()->getDataLayout();
4667   auto *ScalarTy = getMemInstValueType(I);
4668   if (hasIrregularType(ScalarTy, DL, VF))
4669     return false;
4670 
4671   // Check if masking is required.
4672   // A Group may need masking for one of two reasons: it resides in a block that
4673   // needs predication, or it was decided to use masking to deal with gaps.
4674   bool PredicatedAccessRequiresMasking =
4675       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4676   bool AccessWithGapsRequiresMasking =
4677       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4678   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4679     return true;
4680 
4681   // If masked interleaving is required, we expect that the user/target had
4682   // enabled it, because otherwise it either wouldn't have been created or
4683   // it should have been invalidated by the CostModel.
4684   assert(useMaskedInterleavedAccesses(TTI) &&
4685          "Masked interleave-groups for predicated accesses are not enabled.");
4686 
4687   auto *Ty = getMemInstValueType(I);
4688   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4689   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4690                           : TTI.isLegalMaskedStore(Ty, Alignment);
4691 }
4692 
4693 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4694                                                                unsigned VF) {
4695   // Get and ensure we have a valid memory instruction.
4696   LoadInst *LI = dyn_cast<LoadInst>(I);
4697   StoreInst *SI = dyn_cast<StoreInst>(I);
4698   assert((LI || SI) && "Invalid memory instruction");
4699 
4700   auto *Ptr = getLoadStorePointerOperand(I);
4701 
4702   // In order to be widened, the pointer should be consecutive, first of all.
4703   if (!Legal->isConsecutivePtr(Ptr))
4704     return false;
4705 
4706   // If the instruction is a store located in a predicated block, it will be
4707   // scalarized.
4708   if (isScalarWithPredication(I))
4709     return false;
4710 
4711   // If the instruction's allocated size doesn't equal it's type size, it
4712   // requires padding and will be scalarized.
4713   auto &DL = I->getModule()->getDataLayout();
4714   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4715   if (hasIrregularType(ScalarTy, DL, VF))
4716     return false;
4717 
4718   return true;
4719 }
4720 
4721 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4722   // We should not collect Uniforms more than once per VF. Right now,
4723   // this function is called from collectUniformsAndScalars(), which
4724   // already does this check. Collecting Uniforms for VF=1 does not make any
4725   // sense.
4726 
4727   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4728          "This function should not be visited twice for the same VF");
4729 
4730   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4731   // not analyze again.  Uniforms.count(VF) will return 1.
4732   Uniforms[VF].clear();
4733 
4734   // We now know that the loop is vectorizable!
4735   // Collect instructions inside the loop that will remain uniform after
4736   // vectorization.
4737 
4738   // Global values, params and instructions outside of current loop are out of
4739   // scope.
4740   auto isOutOfScope = [&](Value *V) -> bool {
4741     Instruction *I = dyn_cast<Instruction>(V);
4742     return (!I || !TheLoop->contains(I));
4743   };
4744 
4745   SetVector<Instruction *> Worklist;
4746   BasicBlock *Latch = TheLoop->getLoopLatch();
4747 
4748   // Instructions that are scalar with predication must not be considered
4749   // uniform after vectorization, because that would create an erroneous
4750   // replicating region where only a single instance out of VF should be formed.
4751   // TODO: optimize such seldom cases if found important, see PR40816.
4752   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4753     if (isScalarWithPredication(I, VF)) {
4754       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4755                         << *I << "\n");
4756       return;
4757     }
4758     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4759     Worklist.insert(I);
4760   };
4761 
4762   // Start with the conditional branch. If the branch condition is an
4763   // instruction contained in the loop that is only used by the branch, it is
4764   // uniform.
4765   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4766   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4767     addToWorklistIfAllowed(Cmp);
4768 
4769   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4770   // are pointers that are treated like consecutive pointers during
4771   // vectorization. The pointer operands of interleaved accesses are an
4772   // example.
4773   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4774 
4775   // Holds pointer operands of instructions that are possibly non-uniform.
4776   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4777 
4778   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4779     InstWidening WideningDecision = getWideningDecision(I, VF);
4780     assert(WideningDecision != CM_Unknown &&
4781            "Widening decision should be ready at this moment");
4782 
4783     return (WideningDecision == CM_Widen ||
4784             WideningDecision == CM_Widen_Reverse ||
4785             WideningDecision == CM_Interleave);
4786   };
4787   // Iterate over the instructions in the loop, and collect all
4788   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4789   // that a consecutive-like pointer operand will be scalarized, we collect it
4790   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4791   // getelementptr instruction can be used by both vectorized and scalarized
4792   // memory instructions. For example, if a loop loads and stores from the same
4793   // location, but the store is conditional, the store will be scalarized, and
4794   // the getelementptr won't remain uniform.
4795   for (auto *BB : TheLoop->blocks())
4796     for (auto &I : *BB) {
4797       // If there's no pointer operand, there's nothing to do.
4798       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4799       if (!Ptr)
4800         continue;
4801 
4802       // True if all users of Ptr are memory accesses that have Ptr as their
4803       // pointer operand.
4804       auto UsersAreMemAccesses =
4805           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4806             return getLoadStorePointerOperand(U) == Ptr;
4807           });
4808 
4809       // Ensure the memory instruction will not be scalarized or used by
4810       // gather/scatter, making its pointer operand non-uniform. If the pointer
4811       // operand is used by any instruction other than a memory access, we
4812       // conservatively assume the pointer operand may be non-uniform.
4813       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4814         PossibleNonUniformPtrs.insert(Ptr);
4815 
4816       // If the memory instruction will be vectorized and its pointer operand
4817       // is consecutive-like, or interleaving - the pointer operand should
4818       // remain uniform.
4819       else
4820         ConsecutiveLikePtrs.insert(Ptr);
4821     }
4822 
4823   // Add to the Worklist all consecutive and consecutive-like pointers that
4824   // aren't also identified as possibly non-uniform.
4825   for (auto *V : ConsecutiveLikePtrs)
4826     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4827       addToWorklistIfAllowed(V);
4828 
4829   // Expand Worklist in topological order: whenever a new instruction
4830   // is added , its users should be already inside Worklist.  It ensures
4831   // a uniform instruction will only be used by uniform instructions.
4832   unsigned idx = 0;
4833   while (idx != Worklist.size()) {
4834     Instruction *I = Worklist[idx++];
4835 
4836     for (auto OV : I->operand_values()) {
4837       // isOutOfScope operands cannot be uniform instructions.
4838       if (isOutOfScope(OV))
4839         continue;
4840       // First order recurrence Phi's should typically be considered
4841       // non-uniform.
4842       auto *OP = dyn_cast<PHINode>(OV);
4843       if (OP && Legal->isFirstOrderRecurrence(OP))
4844         continue;
4845       // If all the users of the operand are uniform, then add the
4846       // operand into the uniform worklist.
4847       auto *OI = cast<Instruction>(OV);
4848       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4849             auto *J = cast<Instruction>(U);
4850             return Worklist.count(J) ||
4851                    (OI == getLoadStorePointerOperand(J) &&
4852                     isUniformDecision(J, VF));
4853           }))
4854         addToWorklistIfAllowed(OI);
4855     }
4856   }
4857 
4858   // Returns true if Ptr is the pointer operand of a memory access instruction
4859   // I, and I is known to not require scalarization.
4860   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4861     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4862   };
4863 
4864   // For an instruction to be added into Worklist above, all its users inside
4865   // the loop should also be in Worklist. However, this condition cannot be
4866   // true for phi nodes that form a cyclic dependence. We must process phi
4867   // nodes separately. An induction variable will remain uniform if all users
4868   // of the induction variable and induction variable update remain uniform.
4869   // The code below handles both pointer and non-pointer induction variables.
4870   for (auto &Induction : Legal->getInductionVars()) {
4871     auto *Ind = Induction.first;
4872     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4873 
4874     // Determine if all users of the induction variable are uniform after
4875     // vectorization.
4876     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4877       auto *I = cast<Instruction>(U);
4878       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4879              isVectorizedMemAccessUse(I, Ind);
4880     });
4881     if (!UniformInd)
4882       continue;
4883 
4884     // Determine if all users of the induction variable update instruction are
4885     // uniform after vectorization.
4886     auto UniformIndUpdate =
4887         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4888           auto *I = cast<Instruction>(U);
4889           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4890                  isVectorizedMemAccessUse(I, IndUpdate);
4891         });
4892     if (!UniformIndUpdate)
4893       continue;
4894 
4895     // The induction variable and its update instruction will remain uniform.
4896     addToWorklistIfAllowed(Ind);
4897     addToWorklistIfAllowed(IndUpdate);
4898   }
4899 
4900   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4901 }
4902 
4903 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4904   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4905 
4906   if (Legal->getRuntimePointerChecking()->Need) {
4907     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4908         "runtime pointer checks needed. Enable vectorization of this "
4909         "loop with '#pragma clang loop vectorize(enable)' when "
4910         "compiling with -Os/-Oz",
4911         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4912     return true;
4913   }
4914 
4915   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4916     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4917         "runtime SCEV checks needed. Enable vectorization of this "
4918         "loop with '#pragma clang loop vectorize(enable)' when "
4919         "compiling with -Os/-Oz",
4920         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4921     return true;
4922   }
4923 
4924   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4925   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4926     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4927         "runtime stride == 1 checks needed. Enable vectorization of "
4928         "this loop with '#pragma clang loop vectorize(enable)' when "
4929         "compiling with -Os/-Oz",
4930         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4931     return true;
4932   }
4933 
4934   return false;
4935 }
4936 
4937 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4938   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4939     // TODO: It may by useful to do since it's still likely to be dynamically
4940     // uniform if the target can skip.
4941     reportVectorizationFailure(
4942         "Not inserting runtime ptr check for divergent target",
4943         "runtime pointer checks needed. Not enabled for divergent target",
4944         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4945     return None;
4946   }
4947 
4948   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4949   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4950   if (TC == 1) {
4951     reportVectorizationFailure("Single iteration (non) loop",
4952         "loop trip count is one, irrelevant for vectorization",
4953         "SingleIterationLoop", ORE, TheLoop);
4954     return None;
4955   }
4956 
4957   switch (ScalarEpilogueStatus) {
4958   case CM_ScalarEpilogueAllowed:
4959     return computeFeasibleMaxVF(TC);
4960   case CM_ScalarEpilogueNotNeededUsePredicate:
4961     LLVM_DEBUG(
4962         dbgs() << "LV: vector predicate hint/switch found.\n"
4963                << "LV: Not allowing scalar epilogue, creating predicated "
4964                << "vector loop.\n");
4965     break;
4966   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4967     // fallthrough as a special case of OptForSize
4968   case CM_ScalarEpilogueNotAllowedOptSize:
4969     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4970       LLVM_DEBUG(
4971           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4972     else
4973       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4974                         << "count.\n");
4975 
4976     // Bail if runtime checks are required, which are not good when optimising
4977     // for size.
4978     if (runtimeChecksRequired())
4979       return None;
4980     break;
4981   }
4982 
4983   // Now try the tail folding
4984 
4985   // Invalidate interleave groups that require an epilogue if we can't mask
4986   // the interleave-group.
4987   if (!useMaskedInterleavedAccesses(TTI))
4988     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4989 
4990   unsigned MaxVF = computeFeasibleMaxVF(TC);
4991   if (TC > 0 && TC % MaxVF == 0) {
4992     // Accept MaxVF if we do not have a tail.
4993     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4994     return MaxVF;
4995   }
4996 
4997   // If we don't know the precise trip count, or if the trip count that we
4998   // found modulo the vectorization factor is not zero, try to fold the tail
4999   // by masking.
5000   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5001   if (Legal->prepareToFoldTailByMasking()) {
5002     FoldTailByMasking = true;
5003     return MaxVF;
5004   }
5005 
5006   if (TC == 0) {
5007     reportVectorizationFailure(
5008         "Unable to calculate the loop count due to complex control flow",
5009         "unable to calculate the loop count due to complex control flow",
5010         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5011     return None;
5012   }
5013 
5014   reportVectorizationFailure(
5015       "Cannot optimize for size and vectorize at the same time.",
5016       "cannot optimize for size and vectorize at the same time. "
5017       "Enable vectorization of this loop with '#pragma clang loop "
5018       "vectorize(enable)' when compiling with -Os/-Oz",
5019       "NoTailLoopWithOptForSize", ORE, TheLoop);
5020   return None;
5021 }
5022 
5023 unsigned
5024 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5025   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5026   unsigned SmallestType, WidestType;
5027   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5028   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5029 
5030   // Get the maximum safe dependence distance in bits computed by LAA.
5031   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5032   // the memory accesses that is most restrictive (involved in the smallest
5033   // dependence distance).
5034   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5035 
5036   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5037 
5038   unsigned MaxVectorSize = WidestRegister / WidestType;
5039 
5040   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5041                     << " / " << WidestType << " bits.\n");
5042   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5043                     << WidestRegister << " bits.\n");
5044 
5045   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5046                                  " into one vector!");
5047   if (MaxVectorSize == 0) {
5048     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5049     MaxVectorSize = 1;
5050     return MaxVectorSize;
5051   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5052              isPowerOf2_32(ConstTripCount)) {
5053     // We need to clamp the VF to be the ConstTripCount. There is no point in
5054     // choosing a higher viable VF as done in the loop below.
5055     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5056                       << ConstTripCount << "\n");
5057     MaxVectorSize = ConstTripCount;
5058     return MaxVectorSize;
5059   }
5060 
5061   unsigned MaxVF = MaxVectorSize;
5062   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5063       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5064     // Collect all viable vectorization factors larger than the default MaxVF
5065     // (i.e. MaxVectorSize).
5066     SmallVector<unsigned, 8> VFs;
5067     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5068     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5069       VFs.push_back(VS);
5070 
5071     // For each VF calculate its register usage.
5072     auto RUs = calculateRegisterUsage(VFs);
5073 
5074     // Select the largest VF which doesn't require more registers than existing
5075     // ones.
5076     for (int i = RUs.size() - 1; i >= 0; --i) {
5077       bool Selected = true;
5078       for (auto& pair : RUs[i].MaxLocalUsers) {
5079         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5080         if (pair.second > TargetNumRegisters)
5081           Selected = false;
5082       }
5083       if (Selected) {
5084         MaxVF = VFs[i];
5085         break;
5086       }
5087     }
5088     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5089       if (MaxVF < MinVF) {
5090         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5091                           << ") with target's minimum: " << MinVF << '\n');
5092         MaxVF = MinVF;
5093       }
5094     }
5095   }
5096   return MaxVF;
5097 }
5098 
5099 VectorizationFactor
5100 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5101   float Cost = expectedCost(1).first;
5102   const float ScalarCost = Cost;
5103   unsigned Width = 1;
5104   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5105 
5106   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5107   if (ForceVectorization && MaxVF > 1) {
5108     // Ignore scalar width, because the user explicitly wants vectorization.
5109     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5110     // evaluation.
5111     Cost = std::numeric_limits<float>::max();
5112   }
5113 
5114   for (unsigned i = 2; i <= MaxVF; i *= 2) {
5115     // Notice that the vector loop needs to be executed less times, so
5116     // we need to divide the cost of the vector loops by the width of
5117     // the vector elements.
5118     VectorizationCostTy C = expectedCost(i);
5119     float VectorCost = C.first / (float)i;
5120     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5121                       << " costs: " << (int)VectorCost << ".\n");
5122     if (!C.second && !ForceVectorization) {
5123       LLVM_DEBUG(
5124           dbgs() << "LV: Not considering vector loop of width " << i
5125                  << " because it will not generate any vector instructions.\n");
5126       continue;
5127     }
5128     if (VectorCost < Cost) {
5129       Cost = VectorCost;
5130       Width = i;
5131     }
5132   }
5133 
5134   if (!EnableCondStoresVectorization && NumPredStores) {
5135     reportVectorizationFailure("There are conditional stores.",
5136         "store that is conditionally executed prevents vectorization",
5137         "ConditionalStore", ORE, TheLoop);
5138     Width = 1;
5139     Cost = ScalarCost;
5140   }
5141 
5142   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5143              << "LV: Vectorization seems to be not beneficial, "
5144              << "but was forced by a user.\n");
5145   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5146   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5147   return Factor;
5148 }
5149 
5150 std::pair<unsigned, unsigned>
5151 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5152   unsigned MinWidth = -1U;
5153   unsigned MaxWidth = 8;
5154   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5155 
5156   // For each block.
5157   for (BasicBlock *BB : TheLoop->blocks()) {
5158     // For each instruction in the loop.
5159     for (Instruction &I : BB->instructionsWithoutDebug()) {
5160       Type *T = I.getType();
5161 
5162       // Skip ignored values.
5163       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5164         continue;
5165 
5166       // Only examine Loads, Stores and PHINodes.
5167       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5168         continue;
5169 
5170       // Examine PHI nodes that are reduction variables. Update the type to
5171       // account for the recurrence type.
5172       if (auto *PN = dyn_cast<PHINode>(&I)) {
5173         if (!Legal->isReductionVariable(PN))
5174           continue;
5175         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5176         T = RdxDesc.getRecurrenceType();
5177       }
5178 
5179       // Examine the stored values.
5180       if (auto *ST = dyn_cast<StoreInst>(&I))
5181         T = ST->getValueOperand()->getType();
5182 
5183       // Ignore loaded pointer types and stored pointer types that are not
5184       // vectorizable.
5185       //
5186       // FIXME: The check here attempts to predict whether a load or store will
5187       //        be vectorized. We only know this for certain after a VF has
5188       //        been selected. Here, we assume that if an access can be
5189       //        vectorized, it will be. We should also look at extending this
5190       //        optimization to non-pointer types.
5191       //
5192       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5193           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5194         continue;
5195 
5196       MinWidth = std::min(MinWidth,
5197                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5198       MaxWidth = std::max(MaxWidth,
5199                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5200     }
5201   }
5202 
5203   return {MinWidth, MaxWidth};
5204 }
5205 
5206 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5207                                                            unsigned LoopCost) {
5208   // -- The interleave heuristics --
5209   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5210   // There are many micro-architectural considerations that we can't predict
5211   // at this level. For example, frontend pressure (on decode or fetch) due to
5212   // code size, or the number and capabilities of the execution ports.
5213   //
5214   // We use the following heuristics to select the interleave count:
5215   // 1. If the code has reductions, then we interleave to break the cross
5216   // iteration dependency.
5217   // 2. If the loop is really small, then we interleave to reduce the loop
5218   // overhead.
5219   // 3. We don't interleave if we think that we will spill registers to memory
5220   // due to the increased register pressure.
5221 
5222   if (!isScalarEpilogueAllowed())
5223     return 1;
5224 
5225   // We used the distance for the interleave count.
5226   if (Legal->getMaxSafeDepDistBytes() != -1U)
5227     return 1;
5228 
5229   // Do not interleave loops with a relatively small known or estimated trip
5230   // count.
5231   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5232   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5233     return 1;
5234 
5235   RegisterUsage R = calculateRegisterUsage({VF})[0];
5236   // We divide by these constants so assume that we have at least one
5237   // instruction that uses at least one register.
5238   for (auto& pair : R.MaxLocalUsers) {
5239     pair.second = std::max(pair.second, 1U);
5240   }
5241 
5242   // We calculate the interleave count using the following formula.
5243   // Subtract the number of loop invariants from the number of available
5244   // registers. These registers are used by all of the interleaved instances.
5245   // Next, divide the remaining registers by the number of registers that is
5246   // required by the loop, in order to estimate how many parallel instances
5247   // fit without causing spills. All of this is rounded down if necessary to be
5248   // a power of two. We want power of two interleave count to simplify any
5249   // addressing operations or alignment considerations.
5250   // We also want power of two interleave counts to ensure that the induction
5251   // variable of the vector loop wraps to zero, when tail is folded by masking;
5252   // this currently happens when OptForSize, in which case IC is set to 1 above.
5253   unsigned IC = UINT_MAX;
5254 
5255   for (auto& pair : R.MaxLocalUsers) {
5256     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5257     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5258                       << " registers of "
5259                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5260     if (VF == 1) {
5261       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5262         TargetNumRegisters = ForceTargetNumScalarRegs;
5263     } else {
5264       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5265         TargetNumRegisters = ForceTargetNumVectorRegs;
5266     }
5267     unsigned MaxLocalUsers = pair.second;
5268     unsigned LoopInvariantRegs = 0;
5269     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5270       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5271 
5272     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5273     // Don't count the induction variable as interleaved.
5274     if (EnableIndVarRegisterHeur) {
5275       TmpIC =
5276           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5277                         std::max(1U, (MaxLocalUsers - 1)));
5278     }
5279 
5280     IC = std::min(IC, TmpIC);
5281   }
5282 
5283   // Clamp the interleave ranges to reasonable counts.
5284   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5285 
5286   // Check if the user has overridden the max.
5287   if (VF == 1) {
5288     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5289       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5290   } else {
5291     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5292       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5293   }
5294 
5295   // If trip count is known or estimated compile time constant, limit the
5296   // interleave count to be less than the trip count divided by VF.
5297   if (BestKnownTC) {
5298     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5299   }
5300 
5301   // If we did not calculate the cost for VF (because the user selected the VF)
5302   // then we calculate the cost of VF here.
5303   if (LoopCost == 0)
5304     LoopCost = expectedCost(VF).first;
5305 
5306   assert(LoopCost && "Non-zero loop cost expected");
5307 
5308   // Clamp the calculated IC to be between the 1 and the max interleave count
5309   // that the target and trip count allows.
5310   if (IC > MaxInterleaveCount)
5311     IC = MaxInterleaveCount;
5312   else if (IC < 1)
5313     IC = 1;
5314 
5315   // Interleave if we vectorized this loop and there is a reduction that could
5316   // benefit from interleaving.
5317   if (VF > 1 && !Legal->getReductionVars().empty()) {
5318     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5319     return IC;
5320   }
5321 
5322   // Note that if we've already vectorized the loop we will have done the
5323   // runtime check and so interleaving won't require further checks.
5324   bool InterleavingRequiresRuntimePointerCheck =
5325       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5326 
5327   // We want to interleave small loops in order to reduce the loop overhead and
5328   // potentially expose ILP opportunities.
5329   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5330   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5331     // We assume that the cost overhead is 1 and we use the cost model
5332     // to estimate the cost of the loop and interleave until the cost of the
5333     // loop overhead is about 5% of the cost of the loop.
5334     unsigned SmallIC =
5335         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5336 
5337     // Interleave until store/load ports (estimated by max interleave count) are
5338     // saturated.
5339     unsigned NumStores = Legal->getNumStores();
5340     unsigned NumLoads = Legal->getNumLoads();
5341     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5342     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5343 
5344     // If we have a scalar reduction (vector reductions are already dealt with
5345     // by this point), we can increase the critical path length if the loop
5346     // we're interleaving is inside another loop. Limit, by default to 2, so the
5347     // critical path only gets increased by one reduction operation.
5348     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5349       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5350       SmallIC = std::min(SmallIC, F);
5351       StoresIC = std::min(StoresIC, F);
5352       LoadsIC = std::min(LoadsIC, F);
5353     }
5354 
5355     if (EnableLoadStoreRuntimeInterleave &&
5356         std::max(StoresIC, LoadsIC) > SmallIC) {
5357       LLVM_DEBUG(
5358           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5359       return std::max(StoresIC, LoadsIC);
5360     }
5361 
5362     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5363     return SmallIC;
5364   }
5365 
5366   // Interleave if this is a large loop (small loops are already dealt with by
5367   // this point) that could benefit from interleaving.
5368   bool HasReductions = !Legal->getReductionVars().empty();
5369   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5370     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5371     return IC;
5372   }
5373 
5374   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5375   return 1;
5376 }
5377 
5378 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5379 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5380   // This function calculates the register usage by measuring the highest number
5381   // of values that are alive at a single location. Obviously, this is a very
5382   // rough estimation. We scan the loop in a topological order in order and
5383   // assign a number to each instruction. We use RPO to ensure that defs are
5384   // met before their users. We assume that each instruction that has in-loop
5385   // users starts an interval. We record every time that an in-loop value is
5386   // used, so we have a list of the first and last occurrences of each
5387   // instruction. Next, we transpose this data structure into a multi map that
5388   // holds the list of intervals that *end* at a specific location. This multi
5389   // map allows us to perform a linear search. We scan the instructions linearly
5390   // and record each time that a new interval starts, by placing it in a set.
5391   // If we find this value in the multi-map then we remove it from the set.
5392   // The max register usage is the maximum size of the set.
5393   // We also search for instructions that are defined outside the loop, but are
5394   // used inside the loop. We need this number separately from the max-interval
5395   // usage number because when we unroll, loop-invariant values do not take
5396   // more register.
5397   LoopBlocksDFS DFS(TheLoop);
5398   DFS.perform(LI);
5399 
5400   RegisterUsage RU;
5401 
5402   // Each 'key' in the map opens a new interval. The values
5403   // of the map are the index of the 'last seen' usage of the
5404   // instruction that is the key.
5405   using IntervalMap = DenseMap<Instruction *, unsigned>;
5406 
5407   // Maps instruction to its index.
5408   SmallVector<Instruction *, 64> IdxToInstr;
5409   // Marks the end of each interval.
5410   IntervalMap EndPoint;
5411   // Saves the list of instruction indices that are used in the loop.
5412   SmallPtrSet<Instruction *, 8> Ends;
5413   // Saves the list of values that are used in the loop but are
5414   // defined outside the loop, such as arguments and constants.
5415   SmallPtrSet<Value *, 8> LoopInvariants;
5416 
5417   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5418     for (Instruction &I : BB->instructionsWithoutDebug()) {
5419       IdxToInstr.push_back(&I);
5420 
5421       // Save the end location of each USE.
5422       for (Value *U : I.operands()) {
5423         auto *Instr = dyn_cast<Instruction>(U);
5424 
5425         // Ignore non-instruction values such as arguments, constants, etc.
5426         if (!Instr)
5427           continue;
5428 
5429         // If this instruction is outside the loop then record it and continue.
5430         if (!TheLoop->contains(Instr)) {
5431           LoopInvariants.insert(Instr);
5432           continue;
5433         }
5434 
5435         // Overwrite previous end points.
5436         EndPoint[Instr] = IdxToInstr.size();
5437         Ends.insert(Instr);
5438       }
5439     }
5440   }
5441 
5442   // Saves the list of intervals that end with the index in 'key'.
5443   using InstrList = SmallVector<Instruction *, 2>;
5444   DenseMap<unsigned, InstrList> TransposeEnds;
5445 
5446   // Transpose the EndPoints to a list of values that end at each index.
5447   for (auto &Interval : EndPoint)
5448     TransposeEnds[Interval.second].push_back(Interval.first);
5449 
5450   SmallPtrSet<Instruction *, 8> OpenIntervals;
5451 
5452   // Get the size of the widest register.
5453   unsigned MaxSafeDepDist = -1U;
5454   if (Legal->getMaxSafeDepDistBytes() != -1U)
5455     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5456   unsigned WidestRegister =
5457       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5458   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5459 
5460   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5461   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5462 
5463   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5464 
5465   // A lambda that gets the register usage for the given type and VF.
5466   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5467     if (Ty->isTokenTy())
5468       return 0U;
5469     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5470     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5471   };
5472 
5473   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5474     Instruction *I = IdxToInstr[i];
5475 
5476     // Remove all of the instructions that end at this location.
5477     InstrList &List = TransposeEnds[i];
5478     for (Instruction *ToRemove : List)
5479       OpenIntervals.erase(ToRemove);
5480 
5481     // Ignore instructions that are never used within the loop.
5482     if (Ends.find(I) == Ends.end())
5483       continue;
5484 
5485     // Skip ignored values.
5486     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5487       continue;
5488 
5489     // For each VF find the maximum usage of registers.
5490     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5491       // Count the number of live intervals.
5492       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5493 
5494       if (VFs[j] == 1) {
5495         for (auto Inst : OpenIntervals) {
5496           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5497           if (RegUsage.find(ClassID) == RegUsage.end())
5498             RegUsage[ClassID] = 1;
5499           else
5500             RegUsage[ClassID] += 1;
5501         }
5502       } else {
5503         collectUniformsAndScalars(VFs[j]);
5504         for (auto Inst : OpenIntervals) {
5505           // Skip ignored values for VF > 1.
5506           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5507             continue;
5508           if (isScalarAfterVectorization(Inst, VFs[j])) {
5509             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5510             if (RegUsage.find(ClassID) == RegUsage.end())
5511               RegUsage[ClassID] = 1;
5512             else
5513               RegUsage[ClassID] += 1;
5514           } else {
5515             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5516             if (RegUsage.find(ClassID) == RegUsage.end())
5517               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5518             else
5519               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5520           }
5521         }
5522       }
5523 
5524       for (auto& pair : RegUsage) {
5525         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5526           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5527         else
5528           MaxUsages[j][pair.first] = pair.second;
5529       }
5530     }
5531 
5532     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5533                       << OpenIntervals.size() << '\n');
5534 
5535     // Add the current instruction to the list of open intervals.
5536     OpenIntervals.insert(I);
5537   }
5538 
5539   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5540     SmallMapVector<unsigned, unsigned, 4> Invariant;
5541 
5542     for (auto Inst : LoopInvariants) {
5543       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5544       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5545       if (Invariant.find(ClassID) == Invariant.end())
5546         Invariant[ClassID] = Usage;
5547       else
5548         Invariant[ClassID] += Usage;
5549     }
5550 
5551     LLVM_DEBUG({
5552       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5553       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5554              << " item\n";
5555       for (const auto &pair : MaxUsages[i]) {
5556         dbgs() << "LV(REG): RegisterClass: "
5557                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5558                << " registers\n";
5559       }
5560       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5561              << " item\n";
5562       for (const auto &pair : Invariant) {
5563         dbgs() << "LV(REG): RegisterClass: "
5564                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5565                << " registers\n";
5566       }
5567     });
5568 
5569     RU.LoopInvariantRegs = Invariant;
5570     RU.MaxLocalUsers = MaxUsages[i];
5571     RUs[i] = RU;
5572   }
5573 
5574   return RUs;
5575 }
5576 
5577 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5578   // TODO: Cost model for emulated masked load/store is completely
5579   // broken. This hack guides the cost model to use an artificially
5580   // high enough value to practically disable vectorization with such
5581   // operations, except where previously deployed legality hack allowed
5582   // using very low cost values. This is to avoid regressions coming simply
5583   // from moving "masked load/store" check from legality to cost model.
5584   // Masked Load/Gather emulation was previously never allowed.
5585   // Limited number of Masked Store/Scatter emulation was allowed.
5586   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5587   return isa<LoadInst>(I) ||
5588          (isa<StoreInst>(I) &&
5589           NumPredStores > NumberOfStoresToPredicate);
5590 }
5591 
5592 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5593   // If we aren't vectorizing the loop, or if we've already collected the
5594   // instructions to scalarize, there's nothing to do. Collection may already
5595   // have occurred if we have a user-selected VF and are now computing the
5596   // expected cost for interleaving.
5597   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5598     return;
5599 
5600   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5601   // not profitable to scalarize any instructions, the presence of VF in the
5602   // map will indicate that we've analyzed it already.
5603   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5604 
5605   // Find all the instructions that are scalar with predication in the loop and
5606   // determine if it would be better to not if-convert the blocks they are in.
5607   // If so, we also record the instructions to scalarize.
5608   for (BasicBlock *BB : TheLoop->blocks()) {
5609     if (!blockNeedsPredication(BB))
5610       continue;
5611     for (Instruction &I : *BB)
5612       if (isScalarWithPredication(&I)) {
5613         ScalarCostsTy ScalarCosts;
5614         // Do not apply discount logic if hacked cost is needed
5615         // for emulated masked memrefs.
5616         if (!useEmulatedMaskMemRefHack(&I) &&
5617             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5618           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5619         // Remember that BB will remain after vectorization.
5620         PredicatedBBsAfterVectorization.insert(BB);
5621       }
5622   }
5623 }
5624 
5625 int LoopVectorizationCostModel::computePredInstDiscount(
5626     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5627     unsigned VF) {
5628   assert(!isUniformAfterVectorization(PredInst, VF) &&
5629          "Instruction marked uniform-after-vectorization will be predicated");
5630 
5631   // Initialize the discount to zero, meaning that the scalar version and the
5632   // vector version cost the same.
5633   int Discount = 0;
5634 
5635   // Holds instructions to analyze. The instructions we visit are mapped in
5636   // ScalarCosts. Those instructions are the ones that would be scalarized if
5637   // we find that the scalar version costs less.
5638   SmallVector<Instruction *, 8> Worklist;
5639 
5640   // Returns true if the given instruction can be scalarized.
5641   auto canBeScalarized = [&](Instruction *I) -> bool {
5642     // We only attempt to scalarize instructions forming a single-use chain
5643     // from the original predicated block that would otherwise be vectorized.
5644     // Although not strictly necessary, we give up on instructions we know will
5645     // already be scalar to avoid traversing chains that are unlikely to be
5646     // beneficial.
5647     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5648         isScalarAfterVectorization(I, VF))
5649       return false;
5650 
5651     // If the instruction is scalar with predication, it will be analyzed
5652     // separately. We ignore it within the context of PredInst.
5653     if (isScalarWithPredication(I))
5654       return false;
5655 
5656     // If any of the instruction's operands are uniform after vectorization,
5657     // the instruction cannot be scalarized. This prevents, for example, a
5658     // masked load from being scalarized.
5659     //
5660     // We assume we will only emit a value for lane zero of an instruction
5661     // marked uniform after vectorization, rather than VF identical values.
5662     // Thus, if we scalarize an instruction that uses a uniform, we would
5663     // create uses of values corresponding to the lanes we aren't emitting code
5664     // for. This behavior can be changed by allowing getScalarValue to clone
5665     // the lane zero values for uniforms rather than asserting.
5666     for (Use &U : I->operands())
5667       if (auto *J = dyn_cast<Instruction>(U.get()))
5668         if (isUniformAfterVectorization(J, VF))
5669           return false;
5670 
5671     // Otherwise, we can scalarize the instruction.
5672     return true;
5673   };
5674 
5675   // Compute the expected cost discount from scalarizing the entire expression
5676   // feeding the predicated instruction. We currently only consider expressions
5677   // that are single-use instruction chains.
5678   Worklist.push_back(PredInst);
5679   while (!Worklist.empty()) {
5680     Instruction *I = Worklist.pop_back_val();
5681 
5682     // If we've already analyzed the instruction, there's nothing to do.
5683     if (ScalarCosts.find(I) != ScalarCosts.end())
5684       continue;
5685 
5686     // Compute the cost of the vector instruction. Note that this cost already
5687     // includes the scalarization overhead of the predicated instruction.
5688     unsigned VectorCost = getInstructionCost(I, VF).first;
5689 
5690     // Compute the cost of the scalarized instruction. This cost is the cost of
5691     // the instruction as if it wasn't if-converted and instead remained in the
5692     // predicated block. We will scale this cost by block probability after
5693     // computing the scalarization overhead.
5694     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5695 
5696     // Compute the scalarization overhead of needed insertelement instructions
5697     // and phi nodes.
5698     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5699       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5700                                                  true, false);
5701       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5702     }
5703 
5704     // Compute the scalarization overhead of needed extractelement
5705     // instructions. For each of the instruction's operands, if the operand can
5706     // be scalarized, add it to the worklist; otherwise, account for the
5707     // overhead.
5708     for (Use &U : I->operands())
5709       if (auto *J = dyn_cast<Instruction>(U.get())) {
5710         assert(VectorType::isValidElementType(J->getType()) &&
5711                "Instruction has non-scalar type");
5712         if (canBeScalarized(J))
5713           Worklist.push_back(J);
5714         else if (needsExtract(J, VF))
5715           ScalarCost += TTI.getScalarizationOverhead(
5716                               ToVectorTy(J->getType(),VF), false, true);
5717       }
5718 
5719     // Scale the total scalar cost by block probability.
5720     ScalarCost /= getReciprocalPredBlockProb();
5721 
5722     // Compute the discount. A non-negative discount means the vector version
5723     // of the instruction costs more, and scalarizing would be beneficial.
5724     Discount += VectorCost - ScalarCost;
5725     ScalarCosts[I] = ScalarCost;
5726   }
5727 
5728   return Discount;
5729 }
5730 
5731 LoopVectorizationCostModel::VectorizationCostTy
5732 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5733   VectorizationCostTy Cost;
5734 
5735   // For each block.
5736   for (BasicBlock *BB : TheLoop->blocks()) {
5737     VectorizationCostTy BlockCost;
5738 
5739     // For each instruction in the old loop.
5740     for (Instruction &I : BB->instructionsWithoutDebug()) {
5741       // Skip ignored values.
5742       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5743           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5744         continue;
5745 
5746       VectorizationCostTy C = getInstructionCost(&I, VF);
5747 
5748       // Check if we should override the cost.
5749       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5750         C.first = ForceTargetInstructionCost;
5751 
5752       BlockCost.first += C.first;
5753       BlockCost.second |= C.second;
5754       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5755                         << " for VF " << VF << " For instruction: " << I
5756                         << '\n');
5757     }
5758 
5759     // If we are vectorizing a predicated block, it will have been
5760     // if-converted. This means that the block's instructions (aside from
5761     // stores and instructions that may divide by zero) will now be
5762     // unconditionally executed. For the scalar case, we may not always execute
5763     // the predicated block. Thus, scale the block's cost by the probability of
5764     // executing it.
5765     if (VF == 1 && blockNeedsPredication(BB))
5766       BlockCost.first /= getReciprocalPredBlockProb();
5767 
5768     Cost.first += BlockCost.first;
5769     Cost.second |= BlockCost.second;
5770   }
5771 
5772   return Cost;
5773 }
5774 
5775 /// Gets Address Access SCEV after verifying that the access pattern
5776 /// is loop invariant except the induction variable dependence.
5777 ///
5778 /// This SCEV can be sent to the Target in order to estimate the address
5779 /// calculation cost.
5780 static const SCEV *getAddressAccessSCEV(
5781               Value *Ptr,
5782               LoopVectorizationLegality *Legal,
5783               PredicatedScalarEvolution &PSE,
5784               const Loop *TheLoop) {
5785 
5786   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5787   if (!Gep)
5788     return nullptr;
5789 
5790   // We are looking for a gep with all loop invariant indices except for one
5791   // which should be an induction variable.
5792   auto SE = PSE.getSE();
5793   unsigned NumOperands = Gep->getNumOperands();
5794   for (unsigned i = 1; i < NumOperands; ++i) {
5795     Value *Opd = Gep->getOperand(i);
5796     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5797         !Legal->isInductionVariable(Opd))
5798       return nullptr;
5799   }
5800 
5801   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5802   return PSE.getSCEV(Ptr);
5803 }
5804 
5805 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5806   return Legal->hasStride(I->getOperand(0)) ||
5807          Legal->hasStride(I->getOperand(1));
5808 }
5809 
5810 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5811                                                                  unsigned VF) {
5812   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5813   Type *ValTy = getMemInstValueType(I);
5814   auto SE = PSE.getSE();
5815 
5816   unsigned AS = getLoadStoreAddressSpace(I);
5817   Value *Ptr = getLoadStorePointerOperand(I);
5818   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5819 
5820   // Figure out whether the access is strided and get the stride value
5821   // if it's known in compile time
5822   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5823 
5824   // Get the cost of the scalar memory instruction and address computation.
5825   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5826 
5827   // Don't pass *I here, since it is scalar but will actually be part of a
5828   // vectorized loop where the user of it is a vectorized instruction.
5829   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5830   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5831                                    Alignment, AS);
5832 
5833   // Get the overhead of the extractelement and insertelement instructions
5834   // we might create due to scalarization.
5835   Cost += getScalarizationOverhead(I, VF);
5836 
5837   // If we have a predicated store, it may not be executed for each vector
5838   // lane. Scale the cost by the probability of executing the predicated
5839   // block.
5840   if (isPredicatedInst(I)) {
5841     Cost /= getReciprocalPredBlockProb();
5842 
5843     if (useEmulatedMaskMemRefHack(I))
5844       // Artificially setting to a high enough value to practically disable
5845       // vectorization with such operations.
5846       Cost = 3000000;
5847   }
5848 
5849   return Cost;
5850 }
5851 
5852 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5853                                                              unsigned VF) {
5854   Type *ValTy = getMemInstValueType(I);
5855   Type *VectorTy = ToVectorTy(ValTy, VF);
5856   Value *Ptr = getLoadStorePointerOperand(I);
5857   unsigned AS = getLoadStoreAddressSpace(I);
5858   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5859 
5860   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5861          "Stride should be 1 or -1 for consecutive memory access");
5862   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5863   unsigned Cost = 0;
5864   if (Legal->isMaskRequired(I))
5865     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5866                                       Alignment ? Alignment->value() : 0, AS);
5867   else
5868     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5869 
5870   bool Reverse = ConsecutiveStride < 0;
5871   if (Reverse)
5872     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5873   return Cost;
5874 }
5875 
5876 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5877                                                          unsigned VF) {
5878   Type *ValTy = getMemInstValueType(I);
5879   Type *VectorTy = ToVectorTy(ValTy, VF);
5880   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5881   unsigned AS = getLoadStoreAddressSpace(I);
5882   if (isa<LoadInst>(I)) {
5883     return TTI.getAddressComputationCost(ValTy) +
5884            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5885            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5886   }
5887   StoreInst *SI = cast<StoreInst>(I);
5888 
5889   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5890   return TTI.getAddressComputationCost(ValTy) +
5891          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5892          (isLoopInvariantStoreValue
5893               ? 0
5894               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5895                                        VF - 1));
5896 }
5897 
5898 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5899                                                           unsigned VF) {
5900   Type *ValTy = getMemInstValueType(I);
5901   Type *VectorTy = ToVectorTy(ValTy, VF);
5902   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5903   Value *Ptr = getLoadStorePointerOperand(I);
5904 
5905   return TTI.getAddressComputationCost(VectorTy) +
5906          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5907                                     Legal->isMaskRequired(I),
5908                                     Alignment ? Alignment->value() : 0, I);
5909 }
5910 
5911 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5912                                                             unsigned VF) {
5913   Type *ValTy = getMemInstValueType(I);
5914   Type *VectorTy = ToVectorTy(ValTy, VF);
5915   unsigned AS = getLoadStoreAddressSpace(I);
5916 
5917   auto Group = getInterleavedAccessGroup(I);
5918   assert(Group && "Fail to get an interleaved access group.");
5919 
5920   unsigned InterleaveFactor = Group->getFactor();
5921   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5922 
5923   // Holds the indices of existing members in an interleaved load group.
5924   // An interleaved store group doesn't need this as it doesn't allow gaps.
5925   SmallVector<unsigned, 4> Indices;
5926   if (isa<LoadInst>(I)) {
5927     for (unsigned i = 0; i < InterleaveFactor; i++)
5928       if (Group->getMember(i))
5929         Indices.push_back(i);
5930   }
5931 
5932   // Calculate the cost of the whole interleaved group.
5933   bool UseMaskForGaps =
5934       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5935   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5936       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5937       Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5938 
5939   if (Group->isReverse()) {
5940     // TODO: Add support for reversed masked interleaved access.
5941     assert(!Legal->isMaskRequired(I) &&
5942            "Reverse masked interleaved access not supported.");
5943     Cost += Group->getNumMembers() *
5944             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5945   }
5946   return Cost;
5947 }
5948 
5949 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5950                                                               unsigned VF) {
5951   // Calculate scalar cost only. Vectorization cost should be ready at this
5952   // moment.
5953   if (VF == 1) {
5954     Type *ValTy = getMemInstValueType(I);
5955     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5956     unsigned AS = getLoadStoreAddressSpace(I);
5957 
5958     return TTI.getAddressComputationCost(ValTy) +
5959            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5960   }
5961   return getWideningCost(I, VF);
5962 }
5963 
5964 LoopVectorizationCostModel::VectorizationCostTy
5965 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5966   // If we know that this instruction will remain uniform, check the cost of
5967   // the scalar version.
5968   if (isUniformAfterVectorization(I, VF))
5969     VF = 1;
5970 
5971   if (VF > 1 && isProfitableToScalarize(I, VF))
5972     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5973 
5974   // Forced scalars do not have any scalarization overhead.
5975   auto ForcedScalar = ForcedScalars.find(VF);
5976   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5977     auto InstSet = ForcedScalar->second;
5978     if (InstSet.find(I) != InstSet.end())
5979       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5980   }
5981 
5982   Type *VectorTy;
5983   unsigned C = getInstructionCost(I, VF, VectorTy);
5984 
5985   bool TypeNotScalarized =
5986       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5987   return VectorizationCostTy(C, TypeNotScalarized);
5988 }
5989 
5990 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5991                                                               unsigned VF) {
5992 
5993   if (VF == 1)
5994     return 0;
5995 
5996   unsigned Cost = 0;
5997   Type *RetTy = ToVectorTy(I->getType(), VF);
5998   if (!RetTy->isVoidTy() &&
5999       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6000     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
6001 
6002   // Some targets keep addresses scalar.
6003   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6004     return Cost;
6005 
6006   // Some targets support efficient element stores.
6007   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6008     return Cost;
6009 
6010   // Collect operands to consider.
6011   CallInst *CI = dyn_cast<CallInst>(I);
6012   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6013 
6014   // Skip operands that do not require extraction/scalarization and do not incur
6015   // any overhead.
6016   return Cost + TTI.getOperandsScalarizationOverhead(
6017                     filterExtractingOperands(Ops, VF), VF);
6018 }
6019 
6020 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6021   if (VF == 1)
6022     return;
6023   NumPredStores = 0;
6024   for (BasicBlock *BB : TheLoop->blocks()) {
6025     // For each instruction in the old loop.
6026     for (Instruction &I : *BB) {
6027       Value *Ptr =  getLoadStorePointerOperand(&I);
6028       if (!Ptr)
6029         continue;
6030 
6031       // TODO: We should generate better code and update the cost model for
6032       // predicated uniform stores. Today they are treated as any other
6033       // predicated store (see added test cases in
6034       // invariant-store-vectorization.ll).
6035       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6036         NumPredStores++;
6037 
6038       if (Legal->isUniform(Ptr) &&
6039           // Conditional loads and stores should be scalarized and predicated.
6040           // isScalarWithPredication cannot be used here since masked
6041           // gather/scatters are not considered scalar with predication.
6042           !Legal->blockNeedsPredication(I.getParent())) {
6043         // TODO: Avoid replicating loads and stores instead of
6044         // relying on instcombine to remove them.
6045         // Load: Scalar load + broadcast
6046         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6047         unsigned Cost = getUniformMemOpCost(&I, VF);
6048         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6049         continue;
6050       }
6051 
6052       // We assume that widening is the best solution when possible.
6053       if (memoryInstructionCanBeWidened(&I, VF)) {
6054         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6055         int ConsecutiveStride =
6056                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6057         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6058                "Expected consecutive stride.");
6059         InstWidening Decision =
6060             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6061         setWideningDecision(&I, VF, Decision, Cost);
6062         continue;
6063       }
6064 
6065       // Choose between Interleaving, Gather/Scatter or Scalarization.
6066       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6067       unsigned NumAccesses = 1;
6068       if (isAccessInterleaved(&I)) {
6069         auto Group = getInterleavedAccessGroup(&I);
6070         assert(Group && "Fail to get an interleaved access group.");
6071 
6072         // Make one decision for the whole group.
6073         if (getWideningDecision(&I, VF) != CM_Unknown)
6074           continue;
6075 
6076         NumAccesses = Group->getNumMembers();
6077         if (interleavedAccessCanBeWidened(&I, VF))
6078           InterleaveCost = getInterleaveGroupCost(&I, VF);
6079       }
6080 
6081       unsigned GatherScatterCost =
6082           isLegalGatherOrScatter(&I)
6083               ? getGatherScatterCost(&I, VF) * NumAccesses
6084               : std::numeric_limits<unsigned>::max();
6085 
6086       unsigned ScalarizationCost =
6087           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6088 
6089       // Choose better solution for the current VF,
6090       // write down this decision and use it during vectorization.
6091       unsigned Cost;
6092       InstWidening Decision;
6093       if (InterleaveCost <= GatherScatterCost &&
6094           InterleaveCost < ScalarizationCost) {
6095         Decision = CM_Interleave;
6096         Cost = InterleaveCost;
6097       } else if (GatherScatterCost < ScalarizationCost) {
6098         Decision = CM_GatherScatter;
6099         Cost = GatherScatterCost;
6100       } else {
6101         Decision = CM_Scalarize;
6102         Cost = ScalarizationCost;
6103       }
6104       // If the instructions belongs to an interleave group, the whole group
6105       // receives the same decision. The whole group receives the cost, but
6106       // the cost will actually be assigned to one instruction.
6107       if (auto Group = getInterleavedAccessGroup(&I))
6108         setWideningDecision(Group, VF, Decision, Cost);
6109       else
6110         setWideningDecision(&I, VF, Decision, Cost);
6111     }
6112   }
6113 
6114   // Make sure that any load of address and any other address computation
6115   // remains scalar unless there is gather/scatter support. This avoids
6116   // inevitable extracts into address registers, and also has the benefit of
6117   // activating LSR more, since that pass can't optimize vectorized
6118   // addresses.
6119   if (TTI.prefersVectorizedAddressing())
6120     return;
6121 
6122   // Start with all scalar pointer uses.
6123   SmallPtrSet<Instruction *, 8> AddrDefs;
6124   for (BasicBlock *BB : TheLoop->blocks())
6125     for (Instruction &I : *BB) {
6126       Instruction *PtrDef =
6127         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6128       if (PtrDef && TheLoop->contains(PtrDef) &&
6129           getWideningDecision(&I, VF) != CM_GatherScatter)
6130         AddrDefs.insert(PtrDef);
6131     }
6132 
6133   // Add all instructions used to generate the addresses.
6134   SmallVector<Instruction *, 4> Worklist;
6135   for (auto *I : AddrDefs)
6136     Worklist.push_back(I);
6137   while (!Worklist.empty()) {
6138     Instruction *I = Worklist.pop_back_val();
6139     for (auto &Op : I->operands())
6140       if (auto *InstOp = dyn_cast<Instruction>(Op))
6141         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6142             AddrDefs.insert(InstOp).second)
6143           Worklist.push_back(InstOp);
6144   }
6145 
6146   for (auto *I : AddrDefs) {
6147     if (isa<LoadInst>(I)) {
6148       // Setting the desired widening decision should ideally be handled in
6149       // by cost functions, but since this involves the task of finding out
6150       // if the loaded register is involved in an address computation, it is
6151       // instead changed here when we know this is the case.
6152       InstWidening Decision = getWideningDecision(I, VF);
6153       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6154         // Scalarize a widened load of address.
6155         setWideningDecision(I, VF, CM_Scalarize,
6156                             (VF * getMemoryInstructionCost(I, 1)));
6157       else if (auto Group = getInterleavedAccessGroup(I)) {
6158         // Scalarize an interleave group of address loads.
6159         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6160           if (Instruction *Member = Group->getMember(I))
6161             setWideningDecision(Member, VF, CM_Scalarize,
6162                                 (VF * getMemoryInstructionCost(Member, 1)));
6163         }
6164       }
6165     } else
6166       // Make sure I gets scalarized and a cost estimate without
6167       // scalarization overhead.
6168       ForcedScalars[VF].insert(I);
6169   }
6170 }
6171 
6172 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6173                                                         unsigned VF,
6174                                                         Type *&VectorTy) {
6175   Type *RetTy = I->getType();
6176   if (canTruncateToMinimalBitwidth(I, VF))
6177     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6178   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6179   auto SE = PSE.getSE();
6180 
6181   // TODO: We need to estimate the cost of intrinsic calls.
6182   switch (I->getOpcode()) {
6183   case Instruction::GetElementPtr:
6184     // We mark this instruction as zero-cost because the cost of GEPs in
6185     // vectorized code depends on whether the corresponding memory instruction
6186     // is scalarized or not. Therefore, we handle GEPs with the memory
6187     // instruction cost.
6188     return 0;
6189   case Instruction::Br: {
6190     // In cases of scalarized and predicated instructions, there will be VF
6191     // predicated blocks in the vectorized loop. Each branch around these
6192     // blocks requires also an extract of its vector compare i1 element.
6193     bool ScalarPredicatedBB = false;
6194     BranchInst *BI = cast<BranchInst>(I);
6195     if (VF > 1 && BI->isConditional() &&
6196         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6197              PredicatedBBsAfterVectorization.end() ||
6198          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6199              PredicatedBBsAfterVectorization.end()))
6200       ScalarPredicatedBB = true;
6201 
6202     if (ScalarPredicatedBB) {
6203       // Return cost for branches around scalarized and predicated blocks.
6204       Type *Vec_i1Ty =
6205           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6206       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6207               (TTI.getCFInstrCost(Instruction::Br) * VF));
6208     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6209       // The back-edge branch will remain, as will all scalar branches.
6210       return TTI.getCFInstrCost(Instruction::Br);
6211     else
6212       // This branch will be eliminated by if-conversion.
6213       return 0;
6214     // Note: We currently assume zero cost for an unconditional branch inside
6215     // a predicated block since it will become a fall-through, although we
6216     // may decide in the future to call TTI for all branches.
6217   }
6218   case Instruction::PHI: {
6219     auto *Phi = cast<PHINode>(I);
6220 
6221     // First-order recurrences are replaced by vector shuffles inside the loop.
6222     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6223     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6224       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6225                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6226 
6227     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6228     // converted into select instructions. We require N - 1 selects per phi
6229     // node, where N is the number of incoming values.
6230     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6231       return (Phi->getNumIncomingValues() - 1) *
6232              TTI.getCmpSelInstrCost(
6233                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6234                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6235 
6236     return TTI.getCFInstrCost(Instruction::PHI);
6237   }
6238   case Instruction::UDiv:
6239   case Instruction::SDiv:
6240   case Instruction::URem:
6241   case Instruction::SRem:
6242     // If we have a predicated instruction, it may not be executed for each
6243     // vector lane. Get the scalarization cost and scale this amount by the
6244     // probability of executing the predicated block. If the instruction is not
6245     // predicated, we fall through to the next case.
6246     if (VF > 1 && isScalarWithPredication(I)) {
6247       unsigned Cost = 0;
6248 
6249       // These instructions have a non-void type, so account for the phi nodes
6250       // that we will create. This cost is likely to be zero. The phi node
6251       // cost, if any, should be scaled by the block probability because it
6252       // models a copy at the end of each predicated block.
6253       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6254 
6255       // The cost of the non-predicated instruction.
6256       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6257 
6258       // The cost of insertelement and extractelement instructions needed for
6259       // scalarization.
6260       Cost += getScalarizationOverhead(I, VF);
6261 
6262       // Scale the cost by the probability of executing the predicated blocks.
6263       // This assumes the predicated block for each vector lane is equally
6264       // likely.
6265       return Cost / getReciprocalPredBlockProb();
6266     }
6267     LLVM_FALLTHROUGH;
6268   case Instruction::Add:
6269   case Instruction::FAdd:
6270   case Instruction::Sub:
6271   case Instruction::FSub:
6272   case Instruction::Mul:
6273   case Instruction::FMul:
6274   case Instruction::FDiv:
6275   case Instruction::FRem:
6276   case Instruction::Shl:
6277   case Instruction::LShr:
6278   case Instruction::AShr:
6279   case Instruction::And:
6280   case Instruction::Or:
6281   case Instruction::Xor: {
6282     // Since we will replace the stride by 1 the multiplication should go away.
6283     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6284       return 0;
6285     // Certain instructions can be cheaper to vectorize if they have a constant
6286     // second vector operand. One example of this are shifts on x86.
6287     Value *Op2 = I->getOperand(1);
6288     TargetTransformInfo::OperandValueProperties Op2VP;
6289     TargetTransformInfo::OperandValueKind Op2VK =
6290         TTI.getOperandInfo(Op2, Op2VP);
6291     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6292       Op2VK = TargetTransformInfo::OK_UniformValue;
6293 
6294     SmallVector<const Value *, 4> Operands(I->operand_values());
6295     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6296     return N * TTI.getArithmeticInstrCost(
6297                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6298                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6299   }
6300   case Instruction::FNeg: {
6301     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6302     return N * TTI.getArithmeticInstrCost(
6303                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6304                    TargetTransformInfo::OK_AnyValue,
6305                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6306                    I->getOperand(0), I);
6307   }
6308   case Instruction::Select: {
6309     SelectInst *SI = cast<SelectInst>(I);
6310     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6311     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6312     Type *CondTy = SI->getCondition()->getType();
6313     if (!ScalarCond)
6314       CondTy = VectorType::get(CondTy, VF);
6315 
6316     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6317   }
6318   case Instruction::ICmp:
6319   case Instruction::FCmp: {
6320     Type *ValTy = I->getOperand(0)->getType();
6321     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6322     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6323       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6324     VectorTy = ToVectorTy(ValTy, VF);
6325     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6326   }
6327   case Instruction::Store:
6328   case Instruction::Load: {
6329     unsigned Width = VF;
6330     if (Width > 1) {
6331       InstWidening Decision = getWideningDecision(I, Width);
6332       assert(Decision != CM_Unknown &&
6333              "CM decision should be taken at this point");
6334       if (Decision == CM_Scalarize)
6335         Width = 1;
6336     }
6337     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6338     return getMemoryInstructionCost(I, VF);
6339   }
6340   case Instruction::ZExt:
6341   case Instruction::SExt:
6342   case Instruction::FPToUI:
6343   case Instruction::FPToSI:
6344   case Instruction::FPExt:
6345   case Instruction::PtrToInt:
6346   case Instruction::IntToPtr:
6347   case Instruction::SIToFP:
6348   case Instruction::UIToFP:
6349   case Instruction::Trunc:
6350   case Instruction::FPTrunc:
6351   case Instruction::BitCast: {
6352     // We optimize the truncation of induction variables having constant
6353     // integer steps. The cost of these truncations is the same as the scalar
6354     // operation.
6355     if (isOptimizableIVTruncate(I, VF)) {
6356       auto *Trunc = cast<TruncInst>(I);
6357       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6358                                   Trunc->getSrcTy(), Trunc);
6359     }
6360 
6361     Type *SrcScalarTy = I->getOperand(0)->getType();
6362     Type *SrcVecTy =
6363         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6364     if (canTruncateToMinimalBitwidth(I, VF)) {
6365       // This cast is going to be shrunk. This may remove the cast or it might
6366       // turn it into slightly different cast. For example, if MinBW == 16,
6367       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6368       //
6369       // Calculate the modified src and dest types.
6370       Type *MinVecTy = VectorTy;
6371       if (I->getOpcode() == Instruction::Trunc) {
6372         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6373         VectorTy =
6374             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6375       } else if (I->getOpcode() == Instruction::ZExt ||
6376                  I->getOpcode() == Instruction::SExt) {
6377         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6378         VectorTy =
6379             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6380       }
6381     }
6382 
6383     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6384     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6385   }
6386   case Instruction::Call: {
6387     bool NeedToScalarize;
6388     CallInst *CI = cast<CallInst>(I);
6389     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6390     if (getVectorIntrinsicIDForCall(CI, TLI))
6391       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6392     return CallCost;
6393   }
6394   default:
6395     // The cost of executing VF copies of the scalar instruction. This opcode
6396     // is unknown. Assume that it is the same as 'mul'.
6397     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6398            getScalarizationOverhead(I, VF);
6399   } // end of switch.
6400 }
6401 
6402 char LoopVectorize::ID = 0;
6403 
6404 static const char lv_name[] = "Loop Vectorization";
6405 
6406 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6407 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6408 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6409 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6410 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6411 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6412 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6413 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6414 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6415 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6416 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6417 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6418 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6419 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6420 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6421 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6422 
6423 namespace llvm {
6424 
6425 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6426 
6427 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6428                               bool VectorizeOnlyWhenForced) {
6429   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6430 }
6431 
6432 } // end namespace llvm
6433 
6434 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6435   // Check if the pointer operand of a load or store instruction is
6436   // consecutive.
6437   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6438     return Legal->isConsecutivePtr(Ptr);
6439   return false;
6440 }
6441 
6442 void LoopVectorizationCostModel::collectValuesToIgnore() {
6443   // Ignore ephemeral values.
6444   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6445 
6446   // Ignore type-promoting instructions we identified during reduction
6447   // detection.
6448   for (auto &Reduction : Legal->getReductionVars()) {
6449     RecurrenceDescriptor &RedDes = Reduction.second;
6450     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6451     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6452   }
6453   // Ignore type-casting instructions we identified during induction
6454   // detection.
6455   for (auto &Induction : Legal->getInductionVars()) {
6456     InductionDescriptor &IndDes = Induction.second;
6457     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6458     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6459   }
6460 }
6461 
6462 // TODO: we could return a pair of values that specify the max VF and
6463 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6464 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6465 // doesn't have a cost model that can choose which plan to execute if
6466 // more than one is generated.
6467 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6468                                  LoopVectorizationCostModel &CM) {
6469   unsigned WidestType;
6470   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6471   return WidestVectorRegBits / WidestType;
6472 }
6473 
6474 VectorizationFactor
6475 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6476   unsigned VF = UserVF;
6477   // Outer loop handling: They may require CFG and instruction level
6478   // transformations before even evaluating whether vectorization is profitable.
6479   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6480   // the vectorization pipeline.
6481   if (!OrigLoop->empty()) {
6482     // If the user doesn't provide a vectorization factor, determine a
6483     // reasonable one.
6484     if (!UserVF) {
6485       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6486       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6487 
6488       // Make sure we have a VF > 1 for stress testing.
6489       if (VPlanBuildStressTest && VF < 2) {
6490         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6491                           << "overriding computed VF.\n");
6492         VF = 4;
6493       }
6494     }
6495     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6496     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6497     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6498                       << " to build VPlans.\n");
6499     buildVPlans(VF, VF);
6500 
6501     // For VPlan build stress testing, we bail out after VPlan construction.
6502     if (VPlanBuildStressTest)
6503       return VectorizationFactor::Disabled();
6504 
6505     return {VF, 0};
6506   }
6507 
6508   LLVM_DEBUG(
6509       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6510                 "VPlan-native path.\n");
6511   return VectorizationFactor::Disabled();
6512 }
6513 
6514 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6515   assert(OrigLoop->empty() && "Inner loop expected.");
6516   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6517   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
6518     return None;
6519 
6520   // Invalidate interleave groups if all blocks of loop will be predicated.
6521   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6522       !useMaskedInterleavedAccesses(*TTI)) {
6523     LLVM_DEBUG(
6524         dbgs()
6525         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6526            "which requires masked-interleaved support.\n");
6527     CM.InterleaveInfo.reset();
6528   }
6529 
6530   if (UserVF) {
6531     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6532     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6533     // Collect the instructions (and their associated costs) that will be more
6534     // profitable to scalarize.
6535     CM.selectUserVectorizationFactor(UserVF);
6536     buildVPlansWithVPRecipes(UserVF, UserVF);
6537     LLVM_DEBUG(printPlans(dbgs()));
6538     return {{UserVF, 0}};
6539   }
6540 
6541   unsigned MaxVF = MaybeMaxVF.getValue();
6542   assert(MaxVF != 0 && "MaxVF is zero.");
6543 
6544   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6545     // Collect Uniform and Scalar instructions after vectorization with VF.
6546     CM.collectUniformsAndScalars(VF);
6547 
6548     // Collect the instructions (and their associated costs) that will be more
6549     // profitable to scalarize.
6550     if (VF > 1)
6551       CM.collectInstsToScalarize(VF);
6552   }
6553 
6554   buildVPlansWithVPRecipes(1, MaxVF);
6555   LLVM_DEBUG(printPlans(dbgs()));
6556   if (MaxVF == 1)
6557     return VectorizationFactor::Disabled();
6558 
6559   // Select the optimal vectorization factor.
6560   return CM.selectVectorizationFactor(MaxVF);
6561 }
6562 
6563 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6564   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6565                     << '\n');
6566   BestVF = VF;
6567   BestUF = UF;
6568 
6569   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6570     return !Plan->hasVF(VF);
6571   });
6572   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6573 }
6574 
6575 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6576                                            DominatorTree *DT) {
6577   // Perform the actual loop transformation.
6578 
6579   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6580   VPCallbackILV CallbackILV(ILV);
6581 
6582   VPTransformState State{BestVF, BestUF,      LI,
6583                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6584                          &ILV,   CallbackILV};
6585   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6586   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6587 
6588   //===------------------------------------------------===//
6589   //
6590   // Notice: any optimization or new instruction that go
6591   // into the code below should also be implemented in
6592   // the cost-model.
6593   //
6594   //===------------------------------------------------===//
6595 
6596   // 2. Copy and widen instructions from the old loop into the new loop.
6597   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6598   VPlans.front()->execute(&State);
6599 
6600   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6601   //    predication, updating analyses.
6602   ILV.fixVectorizedLoop();
6603 }
6604 
6605 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6606     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6607   BasicBlock *Latch = OrigLoop->getLoopLatch();
6608 
6609   // We create new control-flow for the vectorized loop, so the original
6610   // condition will be dead after vectorization if it's only used by the
6611   // branch.
6612   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6613   if (Cmp && Cmp->hasOneUse())
6614     DeadInstructions.insert(Cmp);
6615 
6616   // We create new "steps" for induction variable updates to which the original
6617   // induction variables map. An original update instruction will be dead if
6618   // all its users except the induction variable are dead.
6619   for (auto &Induction : Legal->getInductionVars()) {
6620     PHINode *Ind = Induction.first;
6621     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6622     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6623           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6624                                  DeadInstructions.end();
6625         }))
6626       DeadInstructions.insert(IndUpdate);
6627 
6628     // We record as "Dead" also the type-casting instructions we had identified
6629     // during induction analysis. We don't need any handling for them in the
6630     // vectorized loop because we have proven that, under a proper runtime
6631     // test guarding the vectorized loop, the value of the phi, and the casted
6632     // value of the phi, are the same. The last instruction in this casting chain
6633     // will get its scalar/vector/widened def from the scalar/vector/widened def
6634     // of the respective phi node. Any other casts in the induction def-use chain
6635     // have no other uses outside the phi update chain, and will be ignored.
6636     InductionDescriptor &IndDes = Induction.second;
6637     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6638     DeadInstructions.insert(Casts.begin(), Casts.end());
6639   }
6640 }
6641 
6642 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6643 
6644 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6645 
6646 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6647                                         Instruction::BinaryOps BinOp) {
6648   // When unrolling and the VF is 1, we only need to add a simple scalar.
6649   Type *Ty = Val->getType();
6650   assert(!Ty->isVectorTy() && "Val must be a scalar");
6651 
6652   if (Ty->isFloatingPointTy()) {
6653     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6654 
6655     // Floating point operations had to be 'fast' to enable the unrolling.
6656     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6657     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6658   }
6659   Constant *C = ConstantInt::get(Ty, StartIdx);
6660   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6661 }
6662 
6663 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6664   SmallVector<Metadata *, 4> MDs;
6665   // Reserve first location for self reference to the LoopID metadata node.
6666   MDs.push_back(nullptr);
6667   bool IsUnrollMetadata = false;
6668   MDNode *LoopID = L->getLoopID();
6669   if (LoopID) {
6670     // First find existing loop unrolling disable metadata.
6671     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6672       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6673       if (MD) {
6674         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6675         IsUnrollMetadata =
6676             S && S->getString().startswith("llvm.loop.unroll.disable");
6677       }
6678       MDs.push_back(LoopID->getOperand(i));
6679     }
6680   }
6681 
6682   if (!IsUnrollMetadata) {
6683     // Add runtime unroll disable metadata.
6684     LLVMContext &Context = L->getHeader()->getContext();
6685     SmallVector<Metadata *, 1> DisableOperands;
6686     DisableOperands.push_back(
6687         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6688     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6689     MDs.push_back(DisableNode);
6690     MDNode *NewLoopID = MDNode::get(Context, MDs);
6691     // Set operand 0 to refer to the loop id itself.
6692     NewLoopID->replaceOperandWith(0, NewLoopID);
6693     L->setLoopID(NewLoopID);
6694   }
6695 }
6696 
6697 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6698     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6699   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6700   bool PredicateAtRangeStart = Predicate(Range.Start);
6701 
6702   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6703     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6704       Range.End = TmpVF;
6705       break;
6706     }
6707 
6708   return PredicateAtRangeStart;
6709 }
6710 
6711 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6712 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6713 /// of VF's starting at a given VF and extending it as much as possible. Each
6714 /// vectorization decision can potentially shorten this sub-range during
6715 /// buildVPlan().
6716 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6717   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6718     VFRange SubRange = {VF, MaxVF + 1};
6719     VPlans.push_back(buildVPlan(SubRange));
6720     VF = SubRange.End;
6721   }
6722 }
6723 
6724 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6725                                          VPlanPtr &Plan) {
6726   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6727 
6728   // Look for cached value.
6729   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6730   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6731   if (ECEntryIt != EdgeMaskCache.end())
6732     return ECEntryIt->second;
6733 
6734   VPValue *SrcMask = createBlockInMask(Src, Plan);
6735 
6736   // The terminator has to be a branch inst!
6737   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6738   assert(BI && "Unexpected terminator found");
6739 
6740   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6741     return EdgeMaskCache[Edge] = SrcMask;
6742 
6743   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6744   assert(EdgeMask && "No Edge Mask found for condition");
6745 
6746   if (BI->getSuccessor(0) != Dst)
6747     EdgeMask = Builder.createNot(EdgeMask);
6748 
6749   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6750     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6751 
6752   return EdgeMaskCache[Edge] = EdgeMask;
6753 }
6754 
6755 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6756   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6757 
6758   // Look for cached value.
6759   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6760   if (BCEntryIt != BlockMaskCache.end())
6761     return BCEntryIt->second;
6762 
6763   // All-one mask is modelled as no-mask following the convention for masked
6764   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6765   VPValue *BlockMask = nullptr;
6766 
6767   if (OrigLoop->getHeader() == BB) {
6768     if (!CM.blockNeedsPredication(BB))
6769       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6770 
6771     // Introduce the early-exit compare IV <= BTC to form header block mask.
6772     // This is used instead of IV < TC because TC may wrap, unlike BTC.
6773     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6774     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6775     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6776     return BlockMaskCache[BB] = BlockMask;
6777   }
6778 
6779   // This is the block mask. We OR all incoming edges.
6780   for (auto *Predecessor : predecessors(BB)) {
6781     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6782     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6783       return BlockMaskCache[BB] = EdgeMask;
6784 
6785     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6786       BlockMask = EdgeMask;
6787       continue;
6788     }
6789 
6790     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6791   }
6792 
6793   return BlockMaskCache[BB] = BlockMask;
6794 }
6795 
6796 VPWidenMemoryInstructionRecipe *
6797 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6798                                   VPlanPtr &Plan) {
6799   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6800     return nullptr;
6801 
6802   auto willWiden = [&](unsigned VF) -> bool {
6803     if (VF == 1)
6804       return false;
6805     LoopVectorizationCostModel::InstWidening Decision =
6806         CM.getWideningDecision(I, VF);
6807     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6808            "CM decision should be taken at this point.");
6809     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6810       return true;
6811     if (CM.isScalarAfterVectorization(I, VF) ||
6812         CM.isProfitableToScalarize(I, VF))
6813       return false;
6814     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6815   };
6816 
6817   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6818     return nullptr;
6819 
6820   VPValue *Mask = nullptr;
6821   if (Legal->isMaskRequired(I))
6822     Mask = createBlockInMask(I->getParent(), Plan);
6823 
6824   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6825   if (LoadInst *Load = dyn_cast<LoadInst>(I))
6826     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
6827 
6828   StoreInst *Store = cast<StoreInst>(I);
6829   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
6830   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
6831 }
6832 
6833 VPWidenIntOrFpInductionRecipe *
6834 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6835   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6836     // Check if this is an integer or fp induction. If so, build the recipe that
6837     // produces its scalar and vector values.
6838     InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6839     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6840         II.getKind() == InductionDescriptor::IK_FpInduction)
6841       return new VPWidenIntOrFpInductionRecipe(Phi);
6842 
6843     return nullptr;
6844   }
6845 
6846   // Optimize the special case where the source is a constant integer
6847   // induction variable. Notice that we can only optimize the 'trunc' case
6848   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6849   // (c) other casts depend on pointer size.
6850 
6851   // Determine whether \p K is a truncation based on an induction variable that
6852   // can be optimized.
6853   auto isOptimizableIVTruncate =
6854       [&](Instruction *K) -> std::function<bool(unsigned)> {
6855     return
6856         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6857   };
6858 
6859   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6860                                isOptimizableIVTruncate(I), Range))
6861     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6862                                              cast<TruncInst>(I));
6863   return nullptr;
6864 }
6865 
6866 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6867   PHINode *Phi = dyn_cast<PHINode>(I);
6868   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6869     return nullptr;
6870 
6871   // We know that all PHIs in non-header blocks are converted into selects, so
6872   // we don't have to worry about the insertion order and we can just use the
6873   // builder. At this point we generate the predication tree. There may be
6874   // duplications since this is a simple recursive scan, but future
6875   // optimizations will clean it up.
6876 
6877   SmallVector<VPValue *, 2> Masks;
6878   unsigned NumIncoming = Phi->getNumIncomingValues();
6879   for (unsigned In = 0; In < NumIncoming; In++) {
6880     VPValue *EdgeMask =
6881       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6882     assert((EdgeMask || NumIncoming == 1) &&
6883            "Multiple predecessors with one having a full mask");
6884     if (EdgeMask)
6885       Masks.push_back(EdgeMask);
6886   }
6887   return new VPBlendRecipe(Phi, Masks);
6888 }
6889 
6890 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(Instruction *I,
6891                                                    VFRange &Range) {
6892 
6893   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6894       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6895 
6896   CallInst *CI = dyn_cast<CallInst>(I);
6897   if (IsPredicated || !CI)
6898     return nullptr;
6899 
6900   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6901   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6902              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6903     return nullptr;
6904 
6905   auto willWiden = [&](unsigned VF) -> bool {
6906     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6907     // The following case may be scalarized depending on the VF.
6908     // The flag shows whether we use Intrinsic or a usual Call for vectorized
6909     // version of the instruction.
6910     // Is it beneficial to perform intrinsic call compared to lib call?
6911     bool NeedToScalarize = false;
6912     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6913     bool UseVectorIntrinsic =
6914         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6915     return UseVectorIntrinsic || !NeedToScalarize;
6916   };
6917 
6918   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6919     return nullptr;
6920 
6921   // Success: widen this call.
6922   return new VPWidenCallRecipe(*CI);
6923 }
6924 
6925 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
6926   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6927       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6928 
6929   if (IsPredicated)
6930     return nullptr;
6931 
6932   auto IsVectorizableOpcode = [](unsigned Opcode) {
6933     switch (Opcode) {
6934     case Instruction::Add:
6935     case Instruction::And:
6936     case Instruction::AShr:
6937     case Instruction::BitCast:
6938     case Instruction::Br:
6939     case Instruction::FAdd:
6940     case Instruction::FCmp:
6941     case Instruction::FDiv:
6942     case Instruction::FMul:
6943     case Instruction::FNeg:
6944     case Instruction::FPExt:
6945     case Instruction::FPToSI:
6946     case Instruction::FPToUI:
6947     case Instruction::FPTrunc:
6948     case Instruction::FRem:
6949     case Instruction::FSub:
6950     case Instruction::ICmp:
6951     case Instruction::IntToPtr:
6952     case Instruction::Load:
6953     case Instruction::LShr:
6954     case Instruction::Mul:
6955     case Instruction::Or:
6956     case Instruction::PHI:
6957     case Instruction::PtrToInt:
6958     case Instruction::SDiv:
6959     case Instruction::Select:
6960     case Instruction::SExt:
6961     case Instruction::Shl:
6962     case Instruction::SIToFP:
6963     case Instruction::SRem:
6964     case Instruction::Store:
6965     case Instruction::Sub:
6966     case Instruction::Trunc:
6967     case Instruction::UDiv:
6968     case Instruction::UIToFP:
6969     case Instruction::URem:
6970     case Instruction::Xor:
6971     case Instruction::ZExt:
6972       return true;
6973     }
6974     return false;
6975   };
6976 
6977   if (!IsVectorizableOpcode(I->getOpcode()))
6978     return nullptr;
6979 
6980   auto willWiden = [&](unsigned VF) -> bool {
6981     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6982                              CM.isProfitableToScalarize(I, VF)))
6983       return false;
6984     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6985       assert(CM.getWideningDecision(I, VF) ==
6986                  LoopVectorizationCostModel::CM_Scalarize &&
6987              "Memory widening decisions should have been taken care by now");
6988       return false;
6989     }
6990     return true;
6991   };
6992 
6993   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6994     return nullptr;
6995 
6996   // Success: widen this instruction.
6997   return new VPWidenRecipe(*I);
6998 }
6999 
7000 VPBasicBlock *VPRecipeBuilder::handleReplication(
7001     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7002     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7003     VPlanPtr &Plan) {
7004   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7005       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7006       Range);
7007 
7008   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7009       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7010 
7011   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
7012   setRecipe(I, Recipe);
7013 
7014   // Find if I uses a predicated instruction. If so, it will use its scalar
7015   // value. Avoid hoisting the insert-element which packs the scalar value into
7016   // a vector value, as that happens iff all users use the vector value.
7017   for (auto &Op : I->operands())
7018     if (auto *PredInst = dyn_cast<Instruction>(Op))
7019       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7020         PredInst2Recipe[PredInst]->setAlsoPack(false);
7021 
7022   // Finalize the recipe for Instr, first if it is not predicated.
7023   if (!IsPredicated) {
7024     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7025     VPBB->appendRecipe(Recipe);
7026     return VPBB;
7027   }
7028   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7029   assert(VPBB->getSuccessors().empty() &&
7030          "VPBB has successors when handling predicated replication.");
7031   // Record predicated instructions for above packing optimizations.
7032   PredInst2Recipe[I] = Recipe;
7033   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7034   VPBlockUtils::insertBlockAfter(Region, VPBB);
7035   auto *RegSucc = new VPBasicBlock();
7036   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7037   return RegSucc;
7038 }
7039 
7040 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7041                                                       VPRecipeBase *PredRecipe,
7042                                                       VPlanPtr &Plan) {
7043   // Instructions marked for predication are replicated and placed under an
7044   // if-then construct to prevent side-effects.
7045 
7046   // Generate recipes to compute the block mask for this region.
7047   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7048 
7049   // Build the triangular if-then region.
7050   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7051   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7052   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7053   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7054   auto *PHIRecipe =
7055       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7056   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7057   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7058   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7059 
7060   // Note: first set Entry as region entry and then connect successors starting
7061   // from it in order, to propagate the "parent" of each VPBasicBlock.
7062   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7063   VPBlockUtils::connectBlocks(Pred, Exit);
7064 
7065   return Region;
7066 }
7067 
7068 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7069                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7070   VPRecipeBase *Recipe = nullptr;
7071 
7072   // First, check for specific widening recipes that deal with calls, memory
7073   // operations, inductions and Phi nodes.
7074   if ((Recipe = tryToWidenCall(Instr, Range)) ||
7075       (Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7076       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7077       (Recipe = tryToBlend(Instr, Plan)) ||
7078       (isa<PHINode>(Instr) &&
7079        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7080     setRecipe(Instr, Recipe);
7081     VPBB->appendRecipe(Recipe);
7082     return true;
7083   }
7084 
7085   // Handle GEP widening.
7086   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7087     auto Scalarize = [&](unsigned VF) {
7088       return CM.isScalarWithPredication(Instr, VF) ||
7089              CM.isScalarAfterVectorization(Instr, VF) ||
7090              CM.isProfitableToScalarize(Instr, VF);
7091     };
7092     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7093       return false;
7094     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7095     setRecipe(Instr, Recipe);
7096     VPBB->appendRecipe(Recipe);
7097     return true;
7098   }
7099 
7100   // Check if Instr is to be widened by a general VPWidenRecipe, after
7101   // having first checked for specific widening recipes.
7102   if ((Recipe = tryToWiden(Instr, Range))) {
7103     setRecipe(Instr, Recipe);
7104     VPBB->appendRecipe(Recipe);
7105     return true;
7106   }
7107 
7108   return false;
7109 }
7110 
7111 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7112                                                         unsigned MaxVF) {
7113   assert(OrigLoop->empty() && "Inner loop expected.");
7114 
7115   // Collect conditions feeding internal conditional branches; they need to be
7116   // represented in VPlan for it to model masking.
7117   SmallPtrSet<Value *, 1> NeedDef;
7118 
7119   auto *Latch = OrigLoop->getLoopLatch();
7120   for (BasicBlock *BB : OrigLoop->blocks()) {
7121     if (BB == Latch)
7122       continue;
7123     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7124     if (Branch && Branch->isConditional())
7125       NeedDef.insert(Branch->getCondition());
7126   }
7127 
7128   // If the tail is to be folded by masking, the primary induction variable
7129   // needs to be represented in VPlan for it to model early-exit masking.
7130   // Also, both the Phi and the live-out instruction of each reduction are
7131   // required in order to introduce a select between them in VPlan.
7132   if (CM.foldTailByMasking()) {
7133     NeedDef.insert(Legal->getPrimaryInduction());
7134     for (auto &Reduction : Legal->getReductionVars()) {
7135       NeedDef.insert(Reduction.first);
7136       NeedDef.insert(Reduction.second.getLoopExitInstr());
7137     }
7138   }
7139 
7140   // Collect instructions from the original loop that will become trivially dead
7141   // in the vectorized loop. We don't need to vectorize these instructions. For
7142   // example, original induction update instructions can become dead because we
7143   // separately emit induction "steps" when generating code for the new loop.
7144   // Similarly, we create a new latch condition when setting up the structure
7145   // of the new loop, so the old one can become dead.
7146   SmallPtrSet<Instruction *, 4> DeadInstructions;
7147   collectTriviallyDeadInstructions(DeadInstructions);
7148 
7149   // Add assume instructions we need to drop to DeadInstructions, to prevent
7150   // them from being added to the VPlan.
7151   // TODO: We only need to drop assumes in blocks that get flattend. If the
7152   // control flow is preserved, we should keep them.
7153   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7154   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7155 
7156   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7157   // Dead instructions do not need sinking. Remove them from SinkAfter.
7158   for (Instruction *I : DeadInstructions)
7159     SinkAfter.erase(I);
7160 
7161   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7162     VFRange SubRange = {VF, MaxVF + 1};
7163     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7164                                              DeadInstructions, SinkAfter));
7165     VF = SubRange.End;
7166   }
7167 }
7168 
7169 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7170     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7171     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7172     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7173 
7174   // Hold a mapping from predicated instructions to their recipes, in order to
7175   // fix their AlsoPack behavior if a user is determined to replicate and use a
7176   // scalar instead of vector value.
7177   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7178 
7179   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7180 
7181   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7182 
7183   // ---------------------------------------------------------------------------
7184   // Pre-construction: record ingredients whose recipes we'll need to further
7185   // process after constructing the initial VPlan.
7186   // ---------------------------------------------------------------------------
7187 
7188   // Mark instructions we'll need to sink later and their targets as
7189   // ingredients whose recipe we'll need to record.
7190   for (auto &Entry : SinkAfter) {
7191     RecipeBuilder.recordRecipeOf(Entry.first);
7192     RecipeBuilder.recordRecipeOf(Entry.second);
7193   }
7194 
7195   // For each interleave group which is relevant for this (possibly trimmed)
7196   // Range, add it to the set of groups to be later applied to the VPlan and add
7197   // placeholders for its members' Recipes which we'll be replacing with a
7198   // single VPInterleaveRecipe.
7199   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7200     auto applyIG = [IG, this](unsigned VF) -> bool {
7201       return (VF >= 2 && // Query is illegal for VF == 1
7202               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7203                   LoopVectorizationCostModel::CM_Interleave);
7204     };
7205     if (!getDecisionAndClampRange(applyIG, Range))
7206       continue;
7207     InterleaveGroups.insert(IG);
7208     for (unsigned i = 0; i < IG->getFactor(); i++)
7209       if (Instruction *Member = IG->getMember(i))
7210         RecipeBuilder.recordRecipeOf(Member);
7211   };
7212 
7213   // ---------------------------------------------------------------------------
7214   // Build initial VPlan: Scan the body of the loop in a topological order to
7215   // visit each basic block after having visited its predecessor basic blocks.
7216   // ---------------------------------------------------------------------------
7217 
7218   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7219   auto Plan = std::make_unique<VPlan>();
7220   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7221   Plan->setEntry(VPBB);
7222 
7223   // Represent values that will have defs inside VPlan.
7224   for (Value *V : NeedDef)
7225     Plan->addVPValue(V);
7226 
7227   // Scan the body of the loop in a topological order to visit each basic block
7228   // after having visited its predecessor basic blocks.
7229   LoopBlocksDFS DFS(OrigLoop);
7230   DFS.perform(LI);
7231 
7232   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7233     // Relevant instructions from basic block BB will be grouped into VPRecipe
7234     // ingredients and fill a new VPBasicBlock.
7235     unsigned VPBBsForBB = 0;
7236     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7237     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7238     VPBB = FirstVPBBForBB;
7239     Builder.setInsertPoint(VPBB);
7240 
7241     // Introduce each ingredient into VPlan.
7242     for (Instruction &I : BB->instructionsWithoutDebug()) {
7243       Instruction *Instr = &I;
7244 
7245       // First filter out irrelevant instructions, to ensure no recipes are
7246       // built for them.
7247       if (isa<BranchInst>(Instr) ||
7248           DeadInstructions.find(Instr) != DeadInstructions.end())
7249         continue;
7250 
7251       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7252         continue;
7253 
7254       // Otherwise, if all widening options failed, Instruction is to be
7255       // replicated. This may create a successor for VPBB.
7256       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7257           Instr, Range, VPBB, PredInst2Recipe, Plan);
7258       if (NextVPBB != VPBB) {
7259         VPBB = NextVPBB;
7260         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7261                                     : "");
7262       }
7263     }
7264   }
7265 
7266   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7267   // may also be empty, such as the last one VPBB, reflecting original
7268   // basic-blocks with no recipes.
7269   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7270   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7271   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7272   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7273   delete PreEntry;
7274 
7275   // ---------------------------------------------------------------------------
7276   // Transform initial VPlan: Apply previously taken decisions, in order, to
7277   // bring the VPlan to its final state.
7278   // ---------------------------------------------------------------------------
7279 
7280   // Apply Sink-After legal constraints.
7281   for (auto &Entry : SinkAfter) {
7282     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7283     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7284     Sink->moveAfter(Target);
7285   }
7286 
7287   // Interleave memory: for each Interleave Group we marked earlier as relevant
7288   // for this VPlan, replace the Recipes widening its memory instructions with a
7289   // single VPInterleaveRecipe at its insertion point.
7290   for (auto IG : InterleaveGroups) {
7291     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7292         RecipeBuilder.getRecipe(IG->getInsertPos()));
7293     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7294         ->insertBefore(Recipe);
7295 
7296     for (unsigned i = 0; i < IG->getFactor(); ++i)
7297       if (Instruction *Member = IG->getMember(i)) {
7298         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7299       }
7300   }
7301 
7302   // Finally, if tail is folded by masking, introduce selects between the phi
7303   // and the live-out instruction of each reduction, at the end of the latch.
7304   if (CM.foldTailByMasking()) {
7305     Builder.setInsertPoint(VPBB);
7306     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7307     for (auto &Reduction : Legal->getReductionVars()) {
7308       VPValue *Phi = Plan->getVPValue(Reduction.first);
7309       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7310       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7311     }
7312   }
7313 
7314   std::string PlanName;
7315   raw_string_ostream RSO(PlanName);
7316   unsigned VF = Range.Start;
7317   Plan->addVF(VF);
7318   RSO << "Initial VPlan for VF={" << VF;
7319   for (VF *= 2; VF < Range.End; VF *= 2) {
7320     Plan->addVF(VF);
7321     RSO << "," << VF;
7322   }
7323   RSO << "},UF>=1";
7324   RSO.flush();
7325   Plan->setName(PlanName);
7326 
7327   return Plan;
7328 }
7329 
7330 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7331   // Outer loop handling: They may require CFG and instruction level
7332   // transformations before even evaluating whether vectorization is profitable.
7333   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7334   // the vectorization pipeline.
7335   assert(!OrigLoop->empty());
7336   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7337 
7338   // Create new empty VPlan
7339   auto Plan = std::make_unique<VPlan>();
7340 
7341   // Build hierarchical CFG
7342   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7343   HCFGBuilder.buildHierarchicalCFG();
7344 
7345   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7346     Plan->addVF(VF);
7347 
7348   if (EnableVPlanPredication) {
7349     VPlanPredicator VPP(*Plan);
7350     VPP.predicate();
7351 
7352     // Avoid running transformation to recipes until masked code generation in
7353     // VPlan-native path is in place.
7354     return Plan;
7355   }
7356 
7357   SmallPtrSet<Instruction *, 1> DeadInstructions;
7358   VPlanTransforms::VPInstructionsToVPRecipes(
7359       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7360   return Plan;
7361 }
7362 
7363 Value* LoopVectorizationPlanner::VPCallbackILV::
7364 getOrCreateVectorValues(Value *V, unsigned Part) {
7365       return ILV.getOrCreateVectorValue(V, Part);
7366 }
7367 
7368 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7369     Value *V, const VPIteration &Instance) {
7370   return ILV.getOrCreateScalarValue(V, Instance);
7371 }
7372 
7373 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7374                                VPSlotTracker &SlotTracker) const {
7375   O << " +\n"
7376     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7377   IG->getInsertPos()->printAsOperand(O, false);
7378   O << ", ";
7379   getAddr()->printAsOperand(O, SlotTracker);
7380   VPValue *Mask = getMask();
7381   if (Mask) {
7382     O << ", ";
7383     Mask->printAsOperand(O, SlotTracker);
7384   }
7385   O << "\\l\"";
7386   for (unsigned i = 0; i < IG->getFactor(); ++i)
7387     if (Instruction *I = IG->getMember(i))
7388       O << " +\n"
7389         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7390 }
7391 
7392 void VPWidenCallRecipe::execute(VPTransformState &State) {
7393   State.ILV->widenCallInstruction(Ingredient);
7394 }
7395 
7396 void VPWidenRecipe::execute(VPTransformState &State) {
7397   State.ILV->widenInstruction(Ingredient);
7398 }
7399 
7400 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7401   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7402                       IsIndexLoopInvariant);
7403 }
7404 
7405 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7406   assert(!State.Instance && "Int or FP induction being replicated.");
7407   State.ILV->widenIntOrFpInduction(IV, Trunc);
7408 }
7409 
7410 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7411   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7412 }
7413 
7414 void VPBlendRecipe::execute(VPTransformState &State) {
7415   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7416   // We know that all PHIs in non-header blocks are converted into
7417   // selects, so we don't have to worry about the insertion order and we
7418   // can just use the builder.
7419   // At this point we generate the predication tree. There may be
7420   // duplications since this is a simple recursive scan, but future
7421   // optimizations will clean it up.
7422 
7423   unsigned NumIncoming = Phi->getNumIncomingValues();
7424 
7425   assert((User || NumIncoming == 1) &&
7426          "Multiple predecessors with predecessors having a full mask");
7427   // Generate a sequence of selects of the form:
7428   // SELECT(Mask3, In3,
7429   //      SELECT(Mask2, In2,
7430   //                   ( ...)))
7431   InnerLoopVectorizer::VectorParts Entry(State.UF);
7432   for (unsigned In = 0; In < NumIncoming; ++In) {
7433     for (unsigned Part = 0; Part < State.UF; ++Part) {
7434       // We might have single edge PHIs (blocks) - use an identity
7435       // 'select' for the first PHI operand.
7436       Value *In0 =
7437           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7438       if (In == 0)
7439         Entry[Part] = In0; // Initialize with the first incoming value.
7440       else {
7441         // Select between the current value and the previous incoming edge
7442         // based on the incoming mask.
7443         Value *Cond = State.get(User->getOperand(In), Part);
7444         Entry[Part] =
7445             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7446       }
7447     }
7448   }
7449   for (unsigned Part = 0; Part < State.UF; ++Part)
7450     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7451 }
7452 
7453 void VPInterleaveRecipe::execute(VPTransformState &State) {
7454   assert(!State.Instance && "Interleave group being replicated.");
7455   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7456                                       getMask());
7457 }
7458 
7459 void VPReplicateRecipe::execute(VPTransformState &State) {
7460   if (State.Instance) { // Generate a single instance.
7461     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7462     // Insert scalar instance packing it into a vector.
7463     if (AlsoPack && State.VF > 1) {
7464       // If we're constructing lane 0, initialize to start from undef.
7465       if (State.Instance->Lane == 0) {
7466         Value *Undef =
7467             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7468         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7469       }
7470       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7471     }
7472     return;
7473   }
7474 
7475   // Generate scalar instances for all VF lanes of all UF parts, unless the
7476   // instruction is uniform inwhich case generate only the first lane for each
7477   // of the UF parts.
7478   unsigned EndLane = IsUniform ? 1 : State.VF;
7479   for (unsigned Part = 0; Part < State.UF; ++Part)
7480     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7481       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7482 }
7483 
7484 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7485   assert(State.Instance && "Branch on Mask works only on single instance.");
7486 
7487   unsigned Part = State.Instance->Part;
7488   unsigned Lane = State.Instance->Lane;
7489 
7490   Value *ConditionBit = nullptr;
7491   if (!User) // Block in mask is all-one.
7492     ConditionBit = State.Builder.getTrue();
7493   else {
7494     VPValue *BlockInMask = User->getOperand(0);
7495     ConditionBit = State.get(BlockInMask, Part);
7496     if (ConditionBit->getType()->isVectorTy())
7497       ConditionBit = State.Builder.CreateExtractElement(
7498           ConditionBit, State.Builder.getInt32(Lane));
7499   }
7500 
7501   // Replace the temporary unreachable terminator with a new conditional branch,
7502   // whose two destinations will be set later when they are created.
7503   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7504   assert(isa<UnreachableInst>(CurrentTerminator) &&
7505          "Expected to replace unreachable terminator with conditional branch.");
7506   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7507   CondBr->setSuccessor(0, nullptr);
7508   ReplaceInstWithInst(CurrentTerminator, CondBr);
7509 }
7510 
7511 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7512   assert(State.Instance && "Predicated instruction PHI works per instance.");
7513   Instruction *ScalarPredInst = cast<Instruction>(
7514       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7515   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7516   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7517   assert(PredicatingBB && "Predicated block has no single predecessor.");
7518 
7519   // By current pack/unpack logic we need to generate only a single phi node: if
7520   // a vector value for the predicated instruction exists at this point it means
7521   // the instruction has vector users only, and a phi for the vector value is
7522   // needed. In this case the recipe of the predicated instruction is marked to
7523   // also do that packing, thereby "hoisting" the insert-element sequence.
7524   // Otherwise, a phi node for the scalar value is needed.
7525   unsigned Part = State.Instance->Part;
7526   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7527     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7528     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7529     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7530     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7531     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7532     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7533   } else {
7534     Type *PredInstType = PredInst->getType();
7535     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7536     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7537     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7538     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7539   }
7540 }
7541 
7542 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7543   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7544   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7545                                         getMask());
7546 }
7547 
7548 // Determine how to lower the scalar epilogue, which depends on 1) optimising
7549 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7550 // predication, and 4) a TTI hook that analyses whether the loop is suitable
7551 // for predication.
7552 static ScalarEpilogueLowering getScalarEpilogueLowering(
7553     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7554     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7555     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7556     LoopVectorizationLegality &LVL) {
7557   bool OptSize =
7558       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7559                                                      PGSOQueryType::IRPass);
7560   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7561   // don't look at hints or options, and don't request a scalar epilogue.
7562   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7563     return CM_ScalarEpilogueNotAllowedOptSize;
7564 
7565   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7566                               !PreferPredicateOverEpilog;
7567 
7568   // 2) Next, if disabling predication is requested on the command line, honour
7569   // this and request a scalar epilogue. Also do this if we don't have a
7570   // primary induction variable, which is required for predication.
7571   if (PredicateOptDisabled || !LVL.getPrimaryInduction())
7572     return CM_ScalarEpilogueAllowed;
7573 
7574   // 3) and 4) look if enabling predication is requested on the command line,
7575   // with a loop hint, or if the TTI hook indicates this is profitable, request
7576   // predication .
7577   if (PreferPredicateOverEpilog ||
7578       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7579       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7580                                         LVL.getLAI()) &&
7581        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7582     return CM_ScalarEpilogueNotNeededUsePredicate;
7583 
7584   return CM_ScalarEpilogueAllowed;
7585 }
7586 
7587 // Process the loop in the VPlan-native vectorization path. This path builds
7588 // VPlan upfront in the vectorization pipeline, which allows to apply
7589 // VPlan-to-VPlan transformations from the very beginning without modifying the
7590 // input LLVM IR.
7591 static bool processLoopInVPlanNativePath(
7592     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7593     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7594     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7595     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7596     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7597 
7598   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7599   Function *F = L->getHeader()->getParent();
7600   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7601 
7602   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7603       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7604 
7605   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7606                                 &Hints, IAI);
7607   // Use the planner for outer loop vectorization.
7608   // TODO: CM is not used at this point inside the planner. Turn CM into an
7609   // optional argument if we don't need it in the future.
7610   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7611 
7612   // Get user vectorization factor.
7613   const unsigned UserVF = Hints.getWidth();
7614 
7615   // Plan how to best vectorize, return the best VF and its cost.
7616   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7617 
7618   // If we are stress testing VPlan builds, do not attempt to generate vector
7619   // code. Masked vector code generation support will follow soon.
7620   // Also, do not attempt to vectorize if no vector code will be produced.
7621   if (VPlanBuildStressTest || EnableVPlanPredication ||
7622       VectorizationFactor::Disabled() == VF)
7623     return false;
7624 
7625   LVP.setBestPlan(VF.Width, 1);
7626 
7627   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7628                          &CM);
7629   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7630                     << L->getHeader()->getParent()->getName() << "\"\n");
7631   LVP.executePlan(LB, DT);
7632 
7633   // Mark the loop as already vectorized to avoid vectorizing again.
7634   Hints.setAlreadyVectorized();
7635 
7636   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7637   return true;
7638 }
7639 
7640 bool LoopVectorizePass::processLoop(Loop *L) {
7641   assert((EnableVPlanNativePath || L->empty()) &&
7642          "VPlan-native path is not enabled. Only process inner loops.");
7643 
7644 #ifndef NDEBUG
7645   const std::string DebugLocStr = getDebugLocString(L);
7646 #endif /* NDEBUG */
7647 
7648   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7649                     << L->getHeader()->getParent()->getName() << "\" from "
7650                     << DebugLocStr << "\n");
7651 
7652   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7653 
7654   LLVM_DEBUG(
7655       dbgs() << "LV: Loop hints:"
7656              << " force="
7657              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7658                      ? "disabled"
7659                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7660                             ? "enabled"
7661                             : "?"))
7662              << " width=" << Hints.getWidth()
7663              << " unroll=" << Hints.getInterleave() << "\n");
7664 
7665   // Function containing loop
7666   Function *F = L->getHeader()->getParent();
7667 
7668   // Looking at the diagnostic output is the only way to determine if a loop
7669   // was vectorized (other than looking at the IR or machine code), so it
7670   // is important to generate an optimization remark for each loop. Most of
7671   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7672   // generated as OptimizationRemark and OptimizationRemarkMissed are
7673   // less verbose reporting vectorized loops and unvectorized loops that may
7674   // benefit from vectorization, respectively.
7675 
7676   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7677     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7678     return false;
7679   }
7680 
7681   PredicatedScalarEvolution PSE(*SE, *L);
7682 
7683   // Check if it is legal to vectorize the loop.
7684   LoopVectorizationRequirements Requirements(*ORE);
7685   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7686                                 &Requirements, &Hints, DB, AC);
7687   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7688     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7689     Hints.emitRemarkWithHints();
7690     return false;
7691   }
7692 
7693   // Check the function attributes and profiles to find out if this function
7694   // should be optimized for size.
7695   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7696       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7697 
7698   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7699   // here. They may require CFG and instruction level transformations before
7700   // even evaluating whether vectorization is profitable. Since we cannot modify
7701   // the incoming IR, we need to build VPlan upfront in the vectorization
7702   // pipeline.
7703   if (!L->empty())
7704     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7705                                         ORE, BFI, PSI, Hints);
7706 
7707   assert(L->empty() && "Inner loop expected.");
7708 
7709   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7710   // count by optimizing for size, to minimize overheads.
7711   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7712   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7713     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7714                       << "This loop is worth vectorizing only if no scalar "
7715                       << "iteration overheads are incurred.");
7716     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7717       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7718     else {
7719       LLVM_DEBUG(dbgs() << "\n");
7720       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7721     }
7722   }
7723 
7724   // Check the function attributes to see if implicit floats are allowed.
7725   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7726   // an integer loop and the vector instructions selected are purely integer
7727   // vector instructions?
7728   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7729     reportVectorizationFailure(
7730         "Can't vectorize when the NoImplicitFloat attribute is used",
7731         "loop not vectorized due to NoImplicitFloat attribute",
7732         "NoImplicitFloat", ORE, L);
7733     Hints.emitRemarkWithHints();
7734     return false;
7735   }
7736 
7737   // Check if the target supports potentially unsafe FP vectorization.
7738   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7739   // for the target we're vectorizing for, to make sure none of the
7740   // additional fp-math flags can help.
7741   if (Hints.isPotentiallyUnsafe() &&
7742       TTI->isFPVectorizationPotentiallyUnsafe()) {
7743     reportVectorizationFailure(
7744         "Potentially unsafe FP op prevents vectorization",
7745         "loop not vectorized due to unsafe FP support.",
7746         "UnsafeFP", ORE, L);
7747     Hints.emitRemarkWithHints();
7748     return false;
7749   }
7750 
7751   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7752   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7753 
7754   // If an override option has been passed in for interleaved accesses, use it.
7755   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7756     UseInterleaved = EnableInterleavedMemAccesses;
7757 
7758   // Analyze interleaved memory accesses.
7759   if (UseInterleaved) {
7760     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7761   }
7762 
7763   // Use the cost model.
7764   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7765                                 F, &Hints, IAI);
7766   CM.collectValuesToIgnore();
7767 
7768   // Use the planner for vectorization.
7769   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7770 
7771   // Get user vectorization factor.
7772   unsigned UserVF = Hints.getWidth();
7773 
7774   // Plan how to best vectorize, return the best VF and its cost.
7775   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7776 
7777   VectorizationFactor VF = VectorizationFactor::Disabled();
7778   unsigned IC = 1;
7779   unsigned UserIC = Hints.getInterleave();
7780 
7781   if (MaybeVF) {
7782     VF = *MaybeVF;
7783     // Select the interleave count.
7784     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7785   }
7786 
7787   // Identify the diagnostic messages that should be produced.
7788   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7789   bool VectorizeLoop = true, InterleaveLoop = true;
7790   if (Requirements.doesNotMeet(F, L, Hints)) {
7791     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7792                          "requirements.\n");
7793     Hints.emitRemarkWithHints();
7794     return false;
7795   }
7796 
7797   if (VF.Width == 1) {
7798     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7799     VecDiagMsg = std::make_pair(
7800         "VectorizationNotBeneficial",
7801         "the cost-model indicates that vectorization is not beneficial");
7802     VectorizeLoop = false;
7803   }
7804 
7805   if (!MaybeVF && UserIC > 1) {
7806     // Tell the user interleaving was avoided up-front, despite being explicitly
7807     // requested.
7808     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7809                          "interleaving should be avoided up front\n");
7810     IntDiagMsg = std::make_pair(
7811         "InterleavingAvoided",
7812         "Ignoring UserIC, because interleaving was avoided up front");
7813     InterleaveLoop = false;
7814   } else if (IC == 1 && UserIC <= 1) {
7815     // Tell the user interleaving is not beneficial.
7816     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7817     IntDiagMsg = std::make_pair(
7818         "InterleavingNotBeneficial",
7819         "the cost-model indicates that interleaving is not beneficial");
7820     InterleaveLoop = false;
7821     if (UserIC == 1) {
7822       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7823       IntDiagMsg.second +=
7824           " and is explicitly disabled or interleave count is set to 1";
7825     }
7826   } else if (IC > 1 && UserIC == 1) {
7827     // Tell the user interleaving is beneficial, but it explicitly disabled.
7828     LLVM_DEBUG(
7829         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7830     IntDiagMsg = std::make_pair(
7831         "InterleavingBeneficialButDisabled",
7832         "the cost-model indicates that interleaving is beneficial "
7833         "but is explicitly disabled or interleave count is set to 1");
7834     InterleaveLoop = false;
7835   }
7836 
7837   // Override IC if user provided an interleave count.
7838   IC = UserIC > 0 ? UserIC : IC;
7839 
7840   // Emit diagnostic messages, if any.
7841   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7842   if (!VectorizeLoop && !InterleaveLoop) {
7843     // Do not vectorize or interleaving the loop.
7844     ORE->emit([&]() {
7845       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7846                                       L->getStartLoc(), L->getHeader())
7847              << VecDiagMsg.second;
7848     });
7849     ORE->emit([&]() {
7850       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7851                                       L->getStartLoc(), L->getHeader())
7852              << IntDiagMsg.second;
7853     });
7854     return false;
7855   } else if (!VectorizeLoop && InterleaveLoop) {
7856     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7857     ORE->emit([&]() {
7858       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7859                                         L->getStartLoc(), L->getHeader())
7860              << VecDiagMsg.second;
7861     });
7862   } else if (VectorizeLoop && !InterleaveLoop) {
7863     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7864                       << ") in " << DebugLocStr << '\n');
7865     ORE->emit([&]() {
7866       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7867                                         L->getStartLoc(), L->getHeader())
7868              << IntDiagMsg.second;
7869     });
7870   } else if (VectorizeLoop && InterleaveLoop) {
7871     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7872                       << ") in " << DebugLocStr << '\n');
7873     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7874   }
7875 
7876   LVP.setBestPlan(VF.Width, IC);
7877 
7878   using namespace ore;
7879   bool DisableRuntimeUnroll = false;
7880   MDNode *OrigLoopID = L->getLoopID();
7881 
7882   if (!VectorizeLoop) {
7883     assert(IC > 1 && "interleave count should not be 1 or 0");
7884     // If we decided that it is not legal to vectorize the loop, then
7885     // interleave it.
7886     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7887                                &CM);
7888     LVP.executePlan(Unroller, DT);
7889 
7890     ORE->emit([&]() {
7891       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7892                                 L->getHeader())
7893              << "interleaved loop (interleaved count: "
7894              << NV("InterleaveCount", IC) << ")";
7895     });
7896   } else {
7897     // If we decided that it is *legal* to vectorize the loop, then do it.
7898     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7899                            &LVL, &CM);
7900     LVP.executePlan(LB, DT);
7901     ++LoopsVectorized;
7902 
7903     // Add metadata to disable runtime unrolling a scalar loop when there are
7904     // no runtime checks about strides and memory. A scalar loop that is
7905     // rarely used is not worth unrolling.
7906     if (!LB.areSafetyChecksAdded())
7907       DisableRuntimeUnroll = true;
7908 
7909     // Report the vectorization decision.
7910     ORE->emit([&]() {
7911       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7912                                 L->getHeader())
7913              << "vectorized loop (vectorization width: "
7914              << NV("VectorizationFactor", VF.Width)
7915              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7916     });
7917   }
7918 
7919   Optional<MDNode *> RemainderLoopID =
7920       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7921                                       LLVMLoopVectorizeFollowupEpilogue});
7922   if (RemainderLoopID.hasValue()) {
7923     L->setLoopID(RemainderLoopID.getValue());
7924   } else {
7925     if (DisableRuntimeUnroll)
7926       AddRuntimeUnrollDisableMetaData(L);
7927 
7928     // Mark the loop as already vectorized to avoid vectorizing again.
7929     Hints.setAlreadyVectorized();
7930   }
7931 
7932   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7933   return true;
7934 }
7935 
7936 bool LoopVectorizePass::runImpl(
7937     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7938     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7939     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7940     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7941     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7942   SE = &SE_;
7943   LI = &LI_;
7944   TTI = &TTI_;
7945   DT = &DT_;
7946   BFI = &BFI_;
7947   TLI = TLI_;
7948   AA = &AA_;
7949   AC = &AC_;
7950   GetLAA = &GetLAA_;
7951   DB = &DB_;
7952   ORE = &ORE_;
7953   PSI = PSI_;
7954 
7955   // Don't attempt if
7956   // 1. the target claims to have no vector registers, and
7957   // 2. interleaving won't help ILP.
7958   //
7959   // The second condition is necessary because, even if the target has no
7960   // vector registers, loop vectorization may still enable scalar
7961   // interleaving.
7962   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7963       TTI->getMaxInterleaveFactor(1) < 2)
7964     return false;
7965 
7966   bool Changed = false;
7967 
7968   // The vectorizer requires loops to be in simplified form.
7969   // Since simplification may add new inner loops, it has to run before the
7970   // legality and profitability checks. This means running the loop vectorizer
7971   // will simplify all loops, regardless of whether anything end up being
7972   // vectorized.
7973   for (auto &L : *LI)
7974     Changed |=
7975         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7976 
7977   // Build up a worklist of inner-loops to vectorize. This is necessary as
7978   // the act of vectorizing or partially unrolling a loop creates new loops
7979   // and can invalidate iterators across the loops.
7980   SmallVector<Loop *, 8> Worklist;
7981 
7982   for (Loop *L : *LI)
7983     collectSupportedLoops(*L, LI, ORE, Worklist);
7984 
7985   LoopsAnalyzed += Worklist.size();
7986 
7987   // Now walk the identified inner loops.
7988   while (!Worklist.empty()) {
7989     Loop *L = Worklist.pop_back_val();
7990 
7991     // For the inner loops we actually process, form LCSSA to simplify the
7992     // transform.
7993     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7994 
7995     Changed |= processLoop(L);
7996   }
7997 
7998   // Process each loop nest in the function.
7999   return Changed;
8000 }
8001 
8002 PreservedAnalyses LoopVectorizePass::run(Function &F,
8003                                          FunctionAnalysisManager &AM) {
8004     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
8005     auto &LI = AM.getResult<LoopAnalysis>(F);
8006     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
8007     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
8008     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
8009     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
8010     auto &AA = AM.getResult<AAManager>(F);
8011     auto &AC = AM.getResult<AssumptionAnalysis>(F);
8012     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
8013     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
8014     MemorySSA *MSSA = EnableMSSALoopDependency
8015                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
8016                           : nullptr;
8017 
8018     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
8019     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
8020         [&](Loop &L) -> const LoopAccessInfo & {
8021       LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
8022       return LAM.getResult<LoopAccessAnalysis>(L, AR);
8023     };
8024     const ModuleAnalysisManager &MAM =
8025         AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
8026     ProfileSummaryInfo *PSI =
8027         MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
8028     bool Changed =
8029         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
8030     if (!Changed)
8031       return PreservedAnalyses::all();
8032     PreservedAnalyses PA;
8033 
8034     // We currently do not preserve loopinfo/dominator analyses with outer loop
8035     // vectorization. Until this is addressed, mark these analyses as preserved
8036     // only for non-VPlan-native path.
8037     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
8038     if (!EnableVPlanNativePath) {
8039       PA.preserve<LoopAnalysis>();
8040       PA.preserve<DominatorTreeAnalysis>();
8041     }
8042     PA.preserve<BasicAA>();
8043     PA.preserve<GlobalsAA>();
8044     return PA;
8045 }
8046