1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Indicates that an epilogue is undesired, predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
192              "will be determined by the smallest type in loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with a estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
281 // VPlan-native vectorization path. It must be used in conjuction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function that returns the type of loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = FixedVectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors, or multiple
379 /// scalars. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
388 /// LoopVectorizationLegality class to provide information about the induction
389 /// and reduction variables that were found to a given vectorization factor.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
399                       ProfileSummaryInfo *PSI)
400       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
401         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
402         Builder(PSE.getSE()->getContext()),
403         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
404         BFI(BFI), PSI(PSI) {}
405   virtual ~InnerLoopVectorizer() = default;
406 
407   /// Create a new empty loop. Unlink the old loop and connect the new one.
408   /// Return the pre-header block of the new loop.
409   BasicBlock *createVectorizedLoopSkeleton();
410 
411   /// Widen a single instruction within the innermost loop.
412   void widenInstruction(Instruction &I, VPUser &Operands,
413                         VPTransformState &State);
414 
415   /// Widen a single call instruction within the innermost loop.
416   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
417                             VPTransformState &State);
418 
419   /// Widen a single select instruction within the innermost loop.
420   void widenSelectInstruction(SelectInst &I, VPUser &Operands,
421                               bool InvariantCond, VPTransformState &State);
422 
423   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
424   void fixVectorizedLoop();
425 
426   // Return true if any runtime check is added.
427   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
428 
429   /// A type for vectorized values in the new loop. Each value from the
430   /// original loop, when vectorized, is represented by UF vector values in the
431   /// new unrolled loop, where UF is the unroll factor.
432   using VectorParts = SmallVector<Value *, 2>;
433 
434   /// Vectorize a single GetElementPtrInst based on information gathered and
435   /// decisions taken during planning.
436   void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
437                 unsigned VF, bool IsPtrLoopInvariant,
438                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
439 
440   /// Vectorize a single PHINode in a block. This method handles the induction
441   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
442   /// arbitrary length vectors.
443   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
444 
445   /// A helper function to scalarize a single Instruction in the innermost loop.
446   /// Generates a sequence of scalar instances for each lane between \p MinLane
447   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
448   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
449   /// Instr's operands.
450   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
451                             const VPIteration &Instance, bool IfPredicateInstr,
452                             VPTransformState &State);
453 
454   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
455   /// is provided, the integer induction variable will first be truncated to
456   /// the corresponding type.
457   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
458 
459   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
460   /// vector or scalar value on-demand if one is not yet available. When
461   /// vectorizing a loop, we visit the definition of an instruction before its
462   /// uses. When visiting the definition, we either vectorize or scalarize the
463   /// instruction, creating an entry for it in the corresponding map. (In some
464   /// cases, such as induction variables, we will create both vector and scalar
465   /// entries.) Then, as we encounter uses of the definition, we derive values
466   /// for each scalar or vector use unless such a value is already available.
467   /// For example, if we scalarize a definition and one of its uses is vector,
468   /// we build the required vector on-demand with an insertelement sequence
469   /// when visiting the use. Otherwise, if the use is scalar, we can use the
470   /// existing scalar definition.
471   ///
472   /// Return a value in the new loop corresponding to \p V from the original
473   /// loop at unroll index \p Part. If the value has already been vectorized,
474   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
475   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
476   /// a new vector value on-demand by inserting the scalar values into a vector
477   /// with an insertelement sequence. If the value has been neither vectorized
478   /// nor scalarized, it must be loop invariant, so we simply broadcast the
479   /// value into a vector.
480   Value *getOrCreateVectorValue(Value *V, unsigned Part);
481 
482   /// Return a value in the new loop corresponding to \p V from the original
483   /// loop at unroll and vector indices \p Instance. If the value has been
484   /// vectorized but not scalarized, the necessary extractelement instruction
485   /// will be generated.
486   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
487 
488   /// Construct the vector value of a scalarized value \p V one lane at a time.
489   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
490 
491   /// Try to vectorize interleaved access group \p Group with the base address
492   /// given in \p Addr, optionally masking the vector operations if \p
493   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
494   /// values in the vectorized loop.
495   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
496                                 VPTransformState &State, VPValue *Addr,
497                                 VPValue *BlockInMask = nullptr);
498 
499   /// Vectorize Load and Store instructions with the base address given in \p
500   /// Addr, optionally masking the vector operations if \p BlockInMask is
501   /// non-null. Use \p State to translate given VPValues to IR values in the
502   /// vectorized loop.
503   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
504                                   VPValue *Addr, VPValue *StoredValue,
505                                   VPValue *BlockInMask);
506 
507   /// Set the debug location in the builder using the debug location in
508   /// the instruction.
509   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
510 
511   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
512   void fixNonInductionPHIs(void);
513 
514 protected:
515   friend class LoopVectorizationPlanner;
516 
517   /// A small list of PHINodes.
518   using PhiVector = SmallVector<PHINode *, 4>;
519 
520   /// A type for scalarized values in the new loop. Each value from the
521   /// original loop, when scalarized, is represented by UF x VF scalar values
522   /// in the new unrolled loop, where UF is the unroll factor and VF is the
523   /// vectorization factor.
524   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
525 
526   /// Set up the values of the IVs correctly when exiting the vector loop.
527   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
528                     Value *CountRoundDown, Value *EndValue,
529                     BasicBlock *MiddleBlock);
530 
531   /// Create a new induction variable inside L.
532   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
533                                    Value *Step, Instruction *DL);
534 
535   /// Handle all cross-iteration phis in the header.
536   void fixCrossIterationPHIs();
537 
538   /// Fix a first-order recurrence. This is the second phase of vectorizing
539   /// this phi node.
540   void fixFirstOrderRecurrence(PHINode *Phi);
541 
542   /// Fix a reduction cross-iteration phi. This is the second phase of
543   /// vectorizing this phi node.
544   void fixReduction(PHINode *Phi);
545 
546   /// Clear NSW/NUW flags from reduction instructions if necessary.
547   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
548 
549   /// The Loop exit block may have single value PHI nodes with some
550   /// incoming value. While vectorizing we only handled real values
551   /// that were defined inside the loop and we should have one value for
552   /// each predecessor of its parent basic block. See PR14725.
553   void fixLCSSAPHIs();
554 
555   /// Iteratively sink the scalarized operands of a predicated instruction into
556   /// the block that was created for it.
557   void sinkScalarOperands(Instruction *PredInst);
558 
559   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
560   /// represented as.
561   void truncateToMinimalBitwidths();
562 
563   /// Create a broadcast instruction. This method generates a broadcast
564   /// instruction (shuffle) for loop invariant values and for the induction
565   /// value. If this is the induction variable then we extend it to N, N+1, ...
566   /// this is needed because each iteration in the loop corresponds to a SIMD
567   /// element.
568   virtual Value *getBroadcastInstrs(Value *V);
569 
570   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
571   /// to each vector element of Val. The sequence starts at StartIndex.
572   /// \p Opcode is relevant for FP induction variable.
573   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
574                                Instruction::BinaryOps Opcode =
575                                Instruction::BinaryOpsEnd);
576 
577   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
578   /// variable on which to base the steps, \p Step is the size of the step, and
579   /// \p EntryVal is the value from the original loop that maps to the steps.
580   /// Note that \p EntryVal doesn't have to be an induction variable - it
581   /// can also be a truncate instruction.
582   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
583                         const InductionDescriptor &ID);
584 
585   /// Create a vector induction phi node based on an existing scalar one. \p
586   /// EntryVal is the value from the original loop that maps to the vector phi
587   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
588   /// truncate instruction, instead of widening the original IV, we widen a
589   /// version of the IV truncated to \p EntryVal's type.
590   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
591                                        Value *Step, Instruction *EntryVal);
592 
593   /// Returns true if an instruction \p I should be scalarized instead of
594   /// vectorized for the chosen vectorization factor.
595   bool shouldScalarizeInstruction(Instruction *I) const;
596 
597   /// Returns true if we should generate a scalar version of \p IV.
598   bool needsScalarInduction(Instruction *IV) const;
599 
600   /// If there is a cast involved in the induction variable \p ID, which should
601   /// be ignored in the vectorized loop body, this function records the
602   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
603   /// cast. We had already proved that the casted Phi is equal to the uncasted
604   /// Phi in the vectorized loop (under a runtime guard), and therefore
605   /// there is no need to vectorize the cast - the same value can be used in the
606   /// vector loop for both the Phi and the cast.
607   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
608   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
609   ///
610   /// \p EntryVal is the value from the original loop that maps to the vector
611   /// phi node and is used to distinguish what is the IV currently being
612   /// processed - original one (if \p EntryVal is a phi corresponding to the
613   /// original IV) or the "newly-created" one based on the proof mentioned above
614   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
615   /// latter case \p EntryVal is a TruncInst and we must not record anything for
616   /// that IV, but it's error-prone to expect callers of this routine to care
617   /// about that, hence this explicit parameter.
618   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
619                                              const Instruction *EntryVal,
620                                              Value *VectorLoopValue,
621                                              unsigned Part,
622                                              unsigned Lane = UINT_MAX);
623 
624   /// Generate a shuffle sequence that will reverse the vector Vec.
625   virtual Value *reverseVector(Value *Vec);
626 
627   /// Returns (and creates if needed) the original loop trip count.
628   Value *getOrCreateTripCount(Loop *NewLoop);
629 
630   /// Returns (and creates if needed) the trip count of the widened loop.
631   Value *getOrCreateVectorTripCount(Loop *NewLoop);
632 
633   /// Returns a bitcasted value to the requested vector type.
634   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
635   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
636                                 const DataLayout &DL);
637 
638   /// Emit a bypass check to see if the vector trip count is zero, including if
639   /// it overflows.
640   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
641 
642   /// Emit a bypass check to see if all of the SCEV assumptions we've
643   /// had to make are correct.
644   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
645 
646   /// Emit bypass checks to check any memory assumptions we may have made.
647   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
648 
649   /// Compute the transformed value of Index at offset StartValue using step
650   /// StepValue.
651   /// For integer induction, returns StartValue + Index * StepValue.
652   /// For pointer induction, returns StartValue[Index * StepValue].
653   /// FIXME: The newly created binary instructions should contain nsw/nuw
654   /// flags, which can be found from the original scalar operations.
655   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
656                               const DataLayout &DL,
657                               const InductionDescriptor &ID) const;
658 
659   /// Add additional metadata to \p To that was not present on \p Orig.
660   ///
661   /// Currently this is used to add the noalias annotations based on the
662   /// inserted memchecks.  Use this for instructions that are *cloned* into the
663   /// vector loop.
664   void addNewMetadata(Instruction *To, const Instruction *Orig);
665 
666   /// Add metadata from one instruction to another.
667   ///
668   /// This includes both the original MDs from \p From and additional ones (\see
669   /// addNewMetadata).  Use this for *newly created* instructions in the vector
670   /// loop.
671   void addMetadata(Instruction *To, Instruction *From);
672 
673   /// Similar to the previous function but it adds the metadata to a
674   /// vector of instructions.
675   void addMetadata(ArrayRef<Value *> To, Instruction *From);
676 
677   /// The original loop.
678   Loop *OrigLoop;
679 
680   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
681   /// dynamic knowledge to simplify SCEV expressions and converts them to a
682   /// more usable form.
683   PredicatedScalarEvolution &PSE;
684 
685   /// Loop Info.
686   LoopInfo *LI;
687 
688   /// Dominator Tree.
689   DominatorTree *DT;
690 
691   /// Alias Analysis.
692   AAResults *AA;
693 
694   /// Target Library Info.
695   const TargetLibraryInfo *TLI;
696 
697   /// Target Transform Info.
698   const TargetTransformInfo *TTI;
699 
700   /// Assumption Cache.
701   AssumptionCache *AC;
702 
703   /// Interface to emit optimization remarks.
704   OptimizationRemarkEmitter *ORE;
705 
706   /// LoopVersioning.  It's only set up (non-null) if memchecks were
707   /// used.
708   ///
709   /// This is currently only used to add no-alias metadata based on the
710   /// memchecks.  The actually versioning is performed manually.
711   std::unique_ptr<LoopVersioning> LVer;
712 
713   /// The vectorization SIMD factor to use. Each vector will have this many
714   /// vector elements.
715   unsigned VF;
716 
717   /// The vectorization unroll factor to use. Each scalar is vectorized to this
718   /// many different vector instructions.
719   unsigned UF;
720 
721   /// The builder that we use
722   IRBuilder<> Builder;
723 
724   // --- Vectorization state ---
725 
726   /// The vector-loop preheader.
727   BasicBlock *LoopVectorPreHeader;
728 
729   /// The scalar-loop preheader.
730   BasicBlock *LoopScalarPreHeader;
731 
732   /// Middle Block between the vector and the scalar.
733   BasicBlock *LoopMiddleBlock;
734 
735   /// The ExitBlock of the scalar loop.
736   BasicBlock *LoopExitBlock;
737 
738   /// The vector loop body.
739   BasicBlock *LoopVectorBody;
740 
741   /// The scalar loop body.
742   BasicBlock *LoopScalarBody;
743 
744   /// A list of all bypass blocks. The first block is the entry of the loop.
745   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
746 
747   /// The new Induction variable which was added to the new block.
748   PHINode *Induction = nullptr;
749 
750   /// The induction variable of the old basic block.
751   PHINode *OldInduction = nullptr;
752 
753   /// Maps values from the original loop to their corresponding values in the
754   /// vectorized loop. A key value can map to either vector values, scalar
755   /// values or both kinds of values, depending on whether the key was
756   /// vectorized and scalarized.
757   VectorizerValueMap VectorLoopValueMap;
758 
759   /// Store instructions that were predicated.
760   SmallVector<Instruction *, 4> PredicatedInstructions;
761 
762   /// Trip count of the original loop.
763   Value *TripCount = nullptr;
764 
765   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
766   Value *VectorTripCount = nullptr;
767 
768   /// The legality analysis.
769   LoopVectorizationLegality *Legal;
770 
771   /// The profitablity analysis.
772   LoopVectorizationCostModel *Cost;
773 
774   // Record whether runtime checks are added.
775   bool AddedSafetyChecks = false;
776 
777   // Holds the end values for each induction variable. We save the end values
778   // so we can later fix-up the external users of the induction variables.
779   DenseMap<PHINode *, Value *> IVEndValues;
780 
781   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
782   // fixed up at the end of vector code generation.
783   SmallVector<PHINode *, 8> OrigPHIsToFix;
784 
785   /// BFI and PSI are used to check for profile guided size optimizations.
786   BlockFrequencyInfo *BFI;
787   ProfileSummaryInfo *PSI;
788 };
789 
790 class InnerLoopUnroller : public InnerLoopVectorizer {
791 public:
792   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
793                     LoopInfo *LI, DominatorTree *DT,
794                     const TargetLibraryInfo *TLI,
795                     const TargetTransformInfo *TTI, AssumptionCache *AC,
796                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
797                     LoopVectorizationLegality *LVL,
798                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
799                     ProfileSummaryInfo *PSI)
800       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
801                             UnrollFactor, LVL, CM, BFI, PSI) {}
802 
803 private:
804   Value *getBroadcastInstrs(Value *V) override;
805   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
806                        Instruction::BinaryOps Opcode =
807                        Instruction::BinaryOpsEnd) override;
808   Value *reverseVector(Value *Vec) override;
809 };
810 
811 } // end namespace llvm
812 
813 /// Look for a meaningful debug location on the instruction or it's
814 /// operands.
815 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
816   if (!I)
817     return I;
818 
819   DebugLoc Empty;
820   if (I->getDebugLoc() != Empty)
821     return I;
822 
823   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
824     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
825       if (OpInst->getDebugLoc() != Empty)
826         return OpInst;
827   }
828 
829   return I;
830 }
831 
832 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
833   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
834     const DILocation *DIL = Inst->getDebugLoc();
835     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
836         !isa<DbgInfoIntrinsic>(Inst)) {
837       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
838       if (NewDIL)
839         B.SetCurrentDebugLocation(NewDIL.getValue());
840       else
841         LLVM_DEBUG(dbgs()
842                    << "Failed to create new discriminator: "
843                    << DIL->getFilename() << " Line: " << DIL->getLine());
844     }
845     else
846       B.SetCurrentDebugLocation(DIL);
847   } else
848     B.SetCurrentDebugLocation(DebugLoc());
849 }
850 
851 /// Write a record \p DebugMsg about vectorization failure to the debug
852 /// output stream. If \p I is passed, it is an instruction that prevents
853 /// vectorization.
854 #ifndef NDEBUG
855 static void debugVectorizationFailure(const StringRef DebugMsg,
856     Instruction *I) {
857   dbgs() << "LV: Not vectorizing: " << DebugMsg;
858   if (I != nullptr)
859     dbgs() << " " << *I;
860   else
861     dbgs() << '.';
862   dbgs() << '\n';
863 }
864 #endif
865 
866 /// Create an analysis remark that explains why vectorization failed
867 ///
868 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
869 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
870 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
871 /// the location of the remark.  \return the remark object that can be
872 /// streamed to.
873 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
874     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
875   Value *CodeRegion = TheLoop->getHeader();
876   DebugLoc DL = TheLoop->getStartLoc();
877 
878   if (I) {
879     CodeRegion = I->getParent();
880     // If there is no debug location attached to the instruction, revert back to
881     // using the loop's.
882     if (I->getDebugLoc())
883       DL = I->getDebugLoc();
884   }
885 
886   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
887   R << "loop not vectorized: ";
888   return R;
889 }
890 
891 namespace llvm {
892 
893 void reportVectorizationFailure(const StringRef DebugMsg,
894     const StringRef OREMsg, const StringRef ORETag,
895     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
896   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
897   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
898   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
899                 ORETag, TheLoop, I) << OREMsg);
900 }
901 
902 } // end namespace llvm
903 
904 #ifndef NDEBUG
905 /// \return string containing a file name and a line # for the given loop.
906 static std::string getDebugLocString(const Loop *L) {
907   std::string Result;
908   if (L) {
909     raw_string_ostream OS(Result);
910     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
911       LoopDbgLoc.print(OS);
912     else
913       // Just print the module name.
914       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
915     OS.flush();
916   }
917   return Result;
918 }
919 #endif
920 
921 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
922                                          const Instruction *Orig) {
923   // If the loop was versioned with memchecks, add the corresponding no-alias
924   // metadata.
925   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
926     LVer->annotateInstWithNoAlias(To, Orig);
927 }
928 
929 void InnerLoopVectorizer::addMetadata(Instruction *To,
930                                       Instruction *From) {
931   propagateMetadata(To, From);
932   addNewMetadata(To, From);
933 }
934 
935 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
936                                       Instruction *From) {
937   for (Value *V : To) {
938     if (Instruction *I = dyn_cast<Instruction>(V))
939       addMetadata(I, From);
940   }
941 }
942 
943 namespace llvm {
944 
945 // Loop vectorization cost-model hints how the scalar epilogue loop should be
946 // lowered.
947 enum ScalarEpilogueLowering {
948 
949   // The default: allowing scalar epilogues.
950   CM_ScalarEpilogueAllowed,
951 
952   // Vectorization with OptForSize: don't allow epilogues.
953   CM_ScalarEpilogueNotAllowedOptSize,
954 
955   // A special case of vectorisation with OptForSize: loops with a very small
956   // trip count are considered for vectorization under OptForSize, thereby
957   // making sure the cost of their loop body is dominant, free of runtime
958   // guards and scalar iteration overheads.
959   CM_ScalarEpilogueNotAllowedLowTripLoop,
960 
961   // Loop hint predicate indicating an epilogue is undesired.
962   CM_ScalarEpilogueNotNeededUsePredicate
963 };
964 
965 /// LoopVectorizationCostModel - estimates the expected speedups due to
966 /// vectorization.
967 /// In many cases vectorization is not profitable. This can happen because of
968 /// a number of reasons. In this class we mainly attempt to predict the
969 /// expected speedup/slowdowns due to the supported instruction set. We use the
970 /// TargetTransformInfo to query the different backends for the cost of
971 /// different operations.
972 class LoopVectorizationCostModel {
973 public:
974   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
975                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
976                              LoopVectorizationLegality *Legal,
977                              const TargetTransformInfo &TTI,
978                              const TargetLibraryInfo *TLI, DemandedBits *DB,
979                              AssumptionCache *AC,
980                              OptimizationRemarkEmitter *ORE, const Function *F,
981                              const LoopVectorizeHints *Hints,
982                              InterleavedAccessInfo &IAI)
983       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
984         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
985         Hints(Hints), InterleaveInfo(IAI) {}
986 
987   /// \return An upper bound for the vectorization factor, or None if
988   /// vectorization and interleaving should be avoided up front.
989   Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
990 
991   /// \return True if runtime checks are required for vectorization, and false
992   /// otherwise.
993   bool runtimeChecksRequired();
994 
995   /// \return The most profitable vectorization factor and the cost of that VF.
996   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
997   /// then this vectorization factor will be selected if vectorization is
998   /// possible.
999   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1000 
1001   /// Setup cost-based decisions for user vectorization factor.
1002   void selectUserVectorizationFactor(unsigned UserVF) {
1003     collectUniformsAndScalars(UserVF);
1004     collectInstsToScalarize(UserVF);
1005   }
1006 
1007   /// \return The size (in bits) of the smallest and widest types in the code
1008   /// that needs to be vectorized. We ignore values that remain scalar such as
1009   /// 64 bit loop indices.
1010   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1011 
1012   /// \return The desired interleave count.
1013   /// If interleave count has been specified by metadata it will be returned.
1014   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1015   /// are the selected vectorization factor and the cost of the selected VF.
1016   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1017 
1018   /// Memory access instruction may be vectorized in more than one way.
1019   /// Form of instruction after vectorization depends on cost.
1020   /// This function takes cost-based decisions for Load/Store instructions
1021   /// and collects them in a map. This decisions map is used for building
1022   /// the lists of loop-uniform and loop-scalar instructions.
1023   /// The calculated cost is saved with widening decision in order to
1024   /// avoid redundant calculations.
1025   void setCostBasedWideningDecision(unsigned VF);
1026 
1027   /// A struct that represents some properties of the register usage
1028   /// of a loop.
1029   struct RegisterUsage {
1030     /// Holds the number of loop invariant values that are used in the loop.
1031     /// The key is ClassID of target-provided register class.
1032     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1033     /// Holds the maximum number of concurrent live intervals in the loop.
1034     /// The key is ClassID of target-provided register class.
1035     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1036   };
1037 
1038   /// \return Returns information about the register usages of the loop for the
1039   /// given vectorization factors.
1040   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1041 
1042   /// Collect values we want to ignore in the cost model.
1043   void collectValuesToIgnore();
1044 
1045   /// \returns The smallest bitwidth each instruction can be represented with.
1046   /// The vector equivalents of these instructions should be truncated to this
1047   /// type.
1048   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1049     return MinBWs;
1050   }
1051 
1052   /// \returns True if it is more profitable to scalarize instruction \p I for
1053   /// vectorization factor \p VF.
1054   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1055     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1056 
1057     // Cost model is not run in the VPlan-native path - return conservative
1058     // result until this changes.
1059     if (EnableVPlanNativePath)
1060       return false;
1061 
1062     auto Scalars = InstsToScalarize.find(VF);
1063     assert(Scalars != InstsToScalarize.end() &&
1064            "VF not yet analyzed for scalarization profitability");
1065     return Scalars->second.find(I) != Scalars->second.end();
1066   }
1067 
1068   /// Returns true if \p I is known to be uniform after vectorization.
1069   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1070     if (VF == 1)
1071       return true;
1072 
1073     // Cost model is not run in the VPlan-native path - return conservative
1074     // result until this changes.
1075     if (EnableVPlanNativePath)
1076       return false;
1077 
1078     auto UniformsPerVF = Uniforms.find(VF);
1079     assert(UniformsPerVF != Uniforms.end() &&
1080            "VF not yet analyzed for uniformity");
1081     return UniformsPerVF->second.count(I);
1082   }
1083 
1084   /// Returns true if \p I is known to be scalar after vectorization.
1085   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1086     if (VF == 1)
1087       return true;
1088 
1089     // Cost model is not run in the VPlan-native path - return conservative
1090     // result until this changes.
1091     if (EnableVPlanNativePath)
1092       return false;
1093 
1094     auto ScalarsPerVF = Scalars.find(VF);
1095     assert(ScalarsPerVF != Scalars.end() &&
1096            "Scalar values are not calculated for VF");
1097     return ScalarsPerVF->second.count(I);
1098   }
1099 
1100   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1101   /// for vectorization factor \p VF.
1102   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1103     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1104            !isProfitableToScalarize(I, VF) &&
1105            !isScalarAfterVectorization(I, VF);
1106   }
1107 
1108   /// Decision that was taken during cost calculation for memory instruction.
1109   enum InstWidening {
1110     CM_Unknown,
1111     CM_Widen,         // For consecutive accesses with stride +1.
1112     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1113     CM_Interleave,
1114     CM_GatherScatter,
1115     CM_Scalarize
1116   };
1117 
1118   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1119   /// instruction \p I and vector width \p VF.
1120   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1121                            unsigned Cost) {
1122     assert(VF >= 2 && "Expected VF >=2");
1123     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1124   }
1125 
1126   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1127   /// interleaving group \p Grp and vector width \p VF.
1128   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1129                            InstWidening W, unsigned Cost) {
1130     assert(VF >= 2 && "Expected VF >=2");
1131     /// Broadcast this decicion to all instructions inside the group.
1132     /// But the cost will be assigned to one instruction only.
1133     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1134       if (auto *I = Grp->getMember(i)) {
1135         if (Grp->getInsertPos() == I)
1136           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1137         else
1138           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1139       }
1140     }
1141   }
1142 
1143   /// Return the cost model decision for the given instruction \p I and vector
1144   /// width \p VF. Return CM_Unknown if this instruction did not pass
1145   /// through the cost modeling.
1146   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1147     assert(VF >= 2 && "Expected VF >=2");
1148 
1149     // Cost model is not run in the VPlan-native path - return conservative
1150     // result until this changes.
1151     if (EnableVPlanNativePath)
1152       return CM_GatherScatter;
1153 
1154     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1155     auto Itr = WideningDecisions.find(InstOnVF);
1156     if (Itr == WideningDecisions.end())
1157       return CM_Unknown;
1158     return Itr->second.first;
1159   }
1160 
1161   /// Return the vectorization cost for the given instruction \p I and vector
1162   /// width \p VF.
1163   unsigned getWideningCost(Instruction *I, unsigned VF) {
1164     assert(VF >= 2 && "Expected VF >=2");
1165     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1166     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1167            "The cost is not calculated");
1168     return WideningDecisions[InstOnVF].second;
1169   }
1170 
1171   /// Return True if instruction \p I is an optimizable truncate whose operand
1172   /// is an induction variable. Such a truncate will be removed by adding a new
1173   /// induction variable with the destination type.
1174   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1175     // If the instruction is not a truncate, return false.
1176     auto *Trunc = dyn_cast<TruncInst>(I);
1177     if (!Trunc)
1178       return false;
1179 
1180     // Get the source and destination types of the truncate.
1181     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1182     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1183 
1184     // If the truncate is free for the given types, return false. Replacing a
1185     // free truncate with an induction variable would add an induction variable
1186     // update instruction to each iteration of the loop. We exclude from this
1187     // check the primary induction variable since it will need an update
1188     // instruction regardless.
1189     Value *Op = Trunc->getOperand(0);
1190     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1191       return false;
1192 
1193     // If the truncated value is not an induction variable, return false.
1194     return Legal->isInductionPhi(Op);
1195   }
1196 
1197   /// Collects the instructions to scalarize for each predicated instruction in
1198   /// the loop.
1199   void collectInstsToScalarize(unsigned VF);
1200 
1201   /// Collect Uniform and Scalar values for the given \p VF.
1202   /// The sets depend on CM decision for Load/Store instructions
1203   /// that may be vectorized as interleave, gather-scatter or scalarized.
1204   void collectUniformsAndScalars(unsigned VF) {
1205     // Do the analysis once.
1206     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1207       return;
1208     setCostBasedWideningDecision(VF);
1209     collectLoopUniforms(VF);
1210     collectLoopScalars(VF);
1211   }
1212 
1213   /// Returns true if the target machine supports masked store operation
1214   /// for the given \p DataType and kind of access to \p Ptr.
1215   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1216     return Legal->isConsecutivePtr(Ptr) &&
1217            TTI.isLegalMaskedStore(DataType, Alignment);
1218   }
1219 
1220   /// Returns true if the target machine supports masked load operation
1221   /// for the given \p DataType and kind of access to \p Ptr.
1222   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1223     return Legal->isConsecutivePtr(Ptr) &&
1224            TTI.isLegalMaskedLoad(DataType, Alignment);
1225   }
1226 
1227   /// Returns true if the target machine supports masked scatter operation
1228   /// for the given \p DataType.
1229   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1230     return TTI.isLegalMaskedScatter(DataType, Alignment);
1231   }
1232 
1233   /// Returns true if the target machine supports masked gather operation
1234   /// for the given \p DataType.
1235   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1236     return TTI.isLegalMaskedGather(DataType, Alignment);
1237   }
1238 
1239   /// Returns true if the target machine can represent \p V as a masked gather
1240   /// or scatter operation.
1241   bool isLegalGatherOrScatter(Value *V) {
1242     bool LI = isa<LoadInst>(V);
1243     bool SI = isa<StoreInst>(V);
1244     if (!LI && !SI)
1245       return false;
1246     auto *Ty = getMemInstValueType(V);
1247     Align Align = getLoadStoreAlignment(V);
1248     return (LI && isLegalMaskedGather(Ty, Align)) ||
1249            (SI && isLegalMaskedScatter(Ty, Align));
1250   }
1251 
1252   /// Returns true if \p I is an instruction that will be scalarized with
1253   /// predication. Such instructions include conditional stores and
1254   /// instructions that may divide by zero.
1255   /// If a non-zero VF has been calculated, we check if I will be scalarized
1256   /// predication for that VF.
1257   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1258 
1259   // Returns true if \p I is an instruction that will be predicated either
1260   // through scalar predication or masked load/store or masked gather/scatter.
1261   // Superset of instructions that return true for isScalarWithPredication.
1262   bool isPredicatedInst(Instruction *I) {
1263     if (!blockNeedsPredication(I->getParent()))
1264       return false;
1265     // Loads and stores that need some form of masked operation are predicated
1266     // instructions.
1267     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1268       return Legal->isMaskRequired(I);
1269     return isScalarWithPredication(I);
1270   }
1271 
1272   /// Returns true if \p I is a memory instruction with consecutive memory
1273   /// access that can be widened.
1274   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1275 
1276   /// Returns true if \p I is a memory instruction in an interleaved-group
1277   /// of memory accesses that can be vectorized with wide vector loads/stores
1278   /// and shuffles.
1279   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1280 
1281   /// Check if \p Instr belongs to any interleaved access group.
1282   bool isAccessInterleaved(Instruction *Instr) {
1283     return InterleaveInfo.isInterleaved(Instr);
1284   }
1285 
1286   /// Get the interleaved access group that \p Instr belongs to.
1287   const InterleaveGroup<Instruction> *
1288   getInterleavedAccessGroup(Instruction *Instr) {
1289     return InterleaveInfo.getInterleaveGroup(Instr);
1290   }
1291 
1292   /// Returns true if an interleaved group requires a scalar iteration
1293   /// to handle accesses with gaps, and there is nothing preventing us from
1294   /// creating a scalar epilogue.
1295   bool requiresScalarEpilogue() const {
1296     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1297   }
1298 
1299   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1300   /// loop hint annotation.
1301   bool isScalarEpilogueAllowed() const {
1302     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1303   }
1304 
1305   /// Returns true if all loop blocks should be masked to fold tail loop.
1306   bool foldTailByMasking() const { return FoldTailByMasking; }
1307 
1308   bool blockNeedsPredication(BasicBlock *BB) {
1309     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1310   }
1311 
1312   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1313   /// with factor VF.  Return the cost of the instruction, including
1314   /// scalarization overhead if it's needed.
1315   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1316 
1317   /// Estimate cost of a call instruction CI if it were vectorized with factor
1318   /// VF. Return the cost of the instruction, including scalarization overhead
1319   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1320   /// scalarized -
1321   /// i.e. either vector version isn't available, or is too expensive.
1322   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1323 
1324   /// Invalidates decisions already taken by the cost model.
1325   void invalidateCostModelingDecisions() {
1326     WideningDecisions.clear();
1327     Uniforms.clear();
1328     Scalars.clear();
1329   }
1330 
1331 private:
1332   unsigned NumPredStores = 0;
1333 
1334   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1335   /// than zero. One is returned if vectorization should best be avoided due
1336   /// to cost.
1337   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1338 
1339   /// The vectorization cost is a combination of the cost itself and a boolean
1340   /// indicating whether any of the contributing operations will actually
1341   /// operate on
1342   /// vector values after type legalization in the backend. If this latter value
1343   /// is
1344   /// false, then all operations will be scalarized (i.e. no vectorization has
1345   /// actually taken place).
1346   using VectorizationCostTy = std::pair<unsigned, bool>;
1347 
1348   /// Returns the expected execution cost. The unit of the cost does
1349   /// not matter because we use the 'cost' units to compare different
1350   /// vector widths. The cost that is returned is *not* normalized by
1351   /// the factor width.
1352   VectorizationCostTy expectedCost(unsigned VF);
1353 
1354   /// Returns the execution time cost of an instruction for a given vector
1355   /// width. Vector width of one means scalar.
1356   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1357 
1358   /// The cost-computation logic from getInstructionCost which provides
1359   /// the vector type as an output parameter.
1360   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1361 
1362   /// Calculate vectorization cost of memory instruction \p I.
1363   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1364 
1365   /// The cost computation for scalarized memory instruction.
1366   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1367 
1368   /// The cost computation for interleaving group of memory instructions.
1369   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1370 
1371   /// The cost computation for Gather/Scatter instruction.
1372   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1373 
1374   /// The cost computation for widening instruction \p I with consecutive
1375   /// memory access.
1376   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1377 
1378   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1379   /// Load: scalar load + broadcast.
1380   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1381   /// element)
1382   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1383 
1384   /// Estimate the overhead of scalarizing an instruction. This is a
1385   /// convenience wrapper for the type-based getScalarizationOverhead API.
1386   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1387 
1388   /// Returns whether the instruction is a load or store and will be a emitted
1389   /// as a vector operation.
1390   bool isConsecutiveLoadOrStore(Instruction *I);
1391 
1392   /// Returns true if an artificially high cost for emulated masked memrefs
1393   /// should be used.
1394   bool useEmulatedMaskMemRefHack(Instruction *I);
1395 
1396   /// Map of scalar integer values to the smallest bitwidth they can be legally
1397   /// represented as. The vector equivalents of these values should be truncated
1398   /// to this type.
1399   MapVector<Instruction *, uint64_t> MinBWs;
1400 
1401   /// A type representing the costs for instructions if they were to be
1402   /// scalarized rather than vectorized. The entries are Instruction-Cost
1403   /// pairs.
1404   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1405 
1406   /// A set containing all BasicBlocks that are known to present after
1407   /// vectorization as a predicated block.
1408   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1409 
1410   /// Records whether it is allowed to have the original scalar loop execute at
1411   /// least once. This may be needed as a fallback loop in case runtime
1412   /// aliasing/dependence checks fail, or to handle the tail/remainder
1413   /// iterations when the trip count is unknown or doesn't divide by the VF,
1414   /// or as a peel-loop to handle gaps in interleave-groups.
1415   /// Under optsize and when the trip count is very small we don't allow any
1416   /// iterations to execute in the scalar loop.
1417   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1418 
1419   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1420   bool FoldTailByMasking = false;
1421 
1422   /// A map holding scalar costs for different vectorization factors. The
1423   /// presence of a cost for an instruction in the mapping indicates that the
1424   /// instruction will be scalarized when vectorizing with the associated
1425   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1426   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1427 
1428   /// Holds the instructions known to be uniform after vectorization.
1429   /// The data is collected per VF.
1430   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1431 
1432   /// Holds the instructions known to be scalar after vectorization.
1433   /// The data is collected per VF.
1434   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1435 
1436   /// Holds the instructions (address computations) that are forced to be
1437   /// scalarized.
1438   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1439 
1440   /// Returns the expected difference in cost from scalarizing the expression
1441   /// feeding a predicated instruction \p PredInst. The instructions to
1442   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1443   /// non-negative return value implies the expression will be scalarized.
1444   /// Currently, only single-use chains are considered for scalarization.
1445   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1446                               unsigned VF);
1447 
1448   /// Collect the instructions that are uniform after vectorization. An
1449   /// instruction is uniform if we represent it with a single scalar value in
1450   /// the vectorized loop corresponding to each vector iteration. Examples of
1451   /// uniform instructions include pointer operands of consecutive or
1452   /// interleaved memory accesses. Note that although uniformity implies an
1453   /// instruction will be scalar, the reverse is not true. In general, a
1454   /// scalarized instruction will be represented by VF scalar values in the
1455   /// vectorized loop, each corresponding to an iteration of the original
1456   /// scalar loop.
1457   void collectLoopUniforms(unsigned VF);
1458 
1459   /// Collect the instructions that are scalar after vectorization. An
1460   /// instruction is scalar if it is known to be uniform or will be scalarized
1461   /// during vectorization. Non-uniform scalarized instructions will be
1462   /// represented by VF values in the vectorized loop, each corresponding to an
1463   /// iteration of the original scalar loop.
1464   void collectLoopScalars(unsigned VF);
1465 
1466   /// Keeps cost model vectorization decision and cost for instructions.
1467   /// Right now it is used for memory instructions only.
1468   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1469                                 std::pair<InstWidening, unsigned>>;
1470 
1471   DecisionList WideningDecisions;
1472 
1473   /// Returns true if \p V is expected to be vectorized and it needs to be
1474   /// extracted.
1475   bool needsExtract(Value *V, unsigned VF) const {
1476     Instruction *I = dyn_cast<Instruction>(V);
1477     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1478       return false;
1479 
1480     // Assume we can vectorize V (and hence we need extraction) if the
1481     // scalars are not computed yet. This can happen, because it is called
1482     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1483     // the scalars are collected. That should be a safe assumption in most
1484     // cases, because we check if the operands have vectorizable types
1485     // beforehand in LoopVectorizationLegality.
1486     return Scalars.find(VF) == Scalars.end() ||
1487            !isScalarAfterVectorization(I, VF);
1488   };
1489 
1490   /// Returns a range containing only operands needing to be extracted.
1491   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1492                                                    unsigned VF) {
1493     return SmallVector<Value *, 4>(make_filter_range(
1494         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1495   }
1496 
1497 public:
1498   /// The loop that we evaluate.
1499   Loop *TheLoop;
1500 
1501   /// Predicated scalar evolution analysis.
1502   PredicatedScalarEvolution &PSE;
1503 
1504   /// Loop Info analysis.
1505   LoopInfo *LI;
1506 
1507   /// Vectorization legality.
1508   LoopVectorizationLegality *Legal;
1509 
1510   /// Vector target information.
1511   const TargetTransformInfo &TTI;
1512 
1513   /// Target Library Info.
1514   const TargetLibraryInfo *TLI;
1515 
1516   /// Demanded bits analysis.
1517   DemandedBits *DB;
1518 
1519   /// Assumption cache.
1520   AssumptionCache *AC;
1521 
1522   /// Interface to emit optimization remarks.
1523   OptimizationRemarkEmitter *ORE;
1524 
1525   const Function *TheFunction;
1526 
1527   /// Loop Vectorize Hint.
1528   const LoopVectorizeHints *Hints;
1529 
1530   /// The interleave access information contains groups of interleaved accesses
1531   /// with the same stride and close to each other.
1532   InterleavedAccessInfo &InterleaveInfo;
1533 
1534   /// Values to ignore in the cost model.
1535   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1536 
1537   /// Values to ignore in the cost model when VF > 1.
1538   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1539 };
1540 
1541 } // end namespace llvm
1542 
1543 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1544 // vectorization. The loop needs to be annotated with #pragma omp simd
1545 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1546 // vector length information is not provided, vectorization is not considered
1547 // explicit. Interleave hints are not allowed either. These limitations will be
1548 // relaxed in the future.
1549 // Please, note that we are currently forced to abuse the pragma 'clang
1550 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1551 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1552 // provides *explicit vectorization hints* (LV can bypass legal checks and
1553 // assume that vectorization is legal). However, both hints are implemented
1554 // using the same metadata (llvm.loop.vectorize, processed by
1555 // LoopVectorizeHints). This will be fixed in the future when the native IR
1556 // representation for pragma 'omp simd' is introduced.
1557 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1558                                    OptimizationRemarkEmitter *ORE) {
1559   assert(!OuterLp->empty() && "This is not an outer loop");
1560   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1561 
1562   // Only outer loops with an explicit vectorization hint are supported.
1563   // Unannotated outer loops are ignored.
1564   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1565     return false;
1566 
1567   Function *Fn = OuterLp->getHeader()->getParent();
1568   if (!Hints.allowVectorization(Fn, OuterLp,
1569                                 true /*VectorizeOnlyWhenForced*/)) {
1570     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1571     return false;
1572   }
1573 
1574   if (Hints.getInterleave() > 1) {
1575     // TODO: Interleave support is future work.
1576     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1577                          "outer loops.\n");
1578     Hints.emitRemarkWithHints();
1579     return false;
1580   }
1581 
1582   return true;
1583 }
1584 
1585 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1586                                   OptimizationRemarkEmitter *ORE,
1587                                   SmallVectorImpl<Loop *> &V) {
1588   // Collect inner loops and outer loops without irreducible control flow. For
1589   // now, only collect outer loops that have explicit vectorization hints. If we
1590   // are stress testing the VPlan H-CFG construction, we collect the outermost
1591   // loop of every loop nest.
1592   if (L.empty() || VPlanBuildStressTest ||
1593       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1594     LoopBlocksRPO RPOT(&L);
1595     RPOT.perform(LI);
1596     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1597       V.push_back(&L);
1598       // TODO: Collect inner loops inside marked outer loops in case
1599       // vectorization fails for the outer loop. Do not invoke
1600       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1601       // already known to be reducible. We can use an inherited attribute for
1602       // that.
1603       return;
1604     }
1605   }
1606   for (Loop *InnerL : L)
1607     collectSupportedLoops(*InnerL, LI, ORE, V);
1608 }
1609 
1610 namespace {
1611 
1612 /// The LoopVectorize Pass.
1613 struct LoopVectorize : public FunctionPass {
1614   /// Pass identification, replacement for typeid
1615   static char ID;
1616 
1617   LoopVectorizePass Impl;
1618 
1619   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1620                          bool VectorizeOnlyWhenForced = false)
1621       : FunctionPass(ID),
1622         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1623     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1624   }
1625 
1626   bool runOnFunction(Function &F) override {
1627     if (skipFunction(F))
1628       return false;
1629 
1630     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1631     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1632     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1633     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1634     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1635     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1636     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1637     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1638     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1639     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1640     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1641     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1642     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1643 
1644     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1645         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1646 
1647     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1648                         GetLAA, *ORE, PSI).MadeAnyChange;
1649   }
1650 
1651   void getAnalysisUsage(AnalysisUsage &AU) const override {
1652     AU.addRequired<AssumptionCacheTracker>();
1653     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1654     AU.addRequired<DominatorTreeWrapperPass>();
1655     AU.addRequired<LoopInfoWrapperPass>();
1656     AU.addRequired<ScalarEvolutionWrapperPass>();
1657     AU.addRequired<TargetTransformInfoWrapperPass>();
1658     AU.addRequired<AAResultsWrapperPass>();
1659     AU.addRequired<LoopAccessLegacyAnalysis>();
1660     AU.addRequired<DemandedBitsWrapperPass>();
1661     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1662     AU.addRequired<InjectTLIMappingsLegacy>();
1663 
1664     // We currently do not preserve loopinfo/dominator analyses with outer loop
1665     // vectorization. Until this is addressed, mark these analyses as preserved
1666     // only for non-VPlan-native path.
1667     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1668     if (!EnableVPlanNativePath) {
1669       AU.addPreserved<LoopInfoWrapperPass>();
1670       AU.addPreserved<DominatorTreeWrapperPass>();
1671     }
1672 
1673     AU.addPreserved<BasicAAWrapperPass>();
1674     AU.addPreserved<GlobalsAAWrapperPass>();
1675     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1676   }
1677 };
1678 
1679 } // end anonymous namespace
1680 
1681 //===----------------------------------------------------------------------===//
1682 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1683 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1684 //===----------------------------------------------------------------------===//
1685 
1686 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1687   // We need to place the broadcast of invariant variables outside the loop,
1688   // but only if it's proven safe to do so. Else, broadcast will be inside
1689   // vector loop body.
1690   Instruction *Instr = dyn_cast<Instruction>(V);
1691   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1692                      (!Instr ||
1693                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1694   // Place the code for broadcasting invariant variables in the new preheader.
1695   IRBuilder<>::InsertPointGuard Guard(Builder);
1696   if (SafeToHoist)
1697     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1698 
1699   // Broadcast the scalar into all locations in the vector.
1700   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1701 
1702   return Shuf;
1703 }
1704 
1705 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1706     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1707   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1708          "Expected either an induction phi-node or a truncate of it!");
1709   Value *Start = II.getStartValue();
1710 
1711   // Construct the initial value of the vector IV in the vector loop preheader
1712   auto CurrIP = Builder.saveIP();
1713   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1714   if (isa<TruncInst>(EntryVal)) {
1715     assert(Start->getType()->isIntegerTy() &&
1716            "Truncation requires an integer type");
1717     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1718     Step = Builder.CreateTrunc(Step, TruncType);
1719     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1720   }
1721   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1722   Value *SteppedStart =
1723       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1724 
1725   // We create vector phi nodes for both integer and floating-point induction
1726   // variables. Here, we determine the kind of arithmetic we will perform.
1727   Instruction::BinaryOps AddOp;
1728   Instruction::BinaryOps MulOp;
1729   if (Step->getType()->isIntegerTy()) {
1730     AddOp = Instruction::Add;
1731     MulOp = Instruction::Mul;
1732   } else {
1733     AddOp = II.getInductionOpcode();
1734     MulOp = Instruction::FMul;
1735   }
1736 
1737   // Multiply the vectorization factor by the step using integer or
1738   // floating-point arithmetic as appropriate.
1739   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1740   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1741 
1742   // Create a vector splat to use in the induction update.
1743   //
1744   // FIXME: If the step is non-constant, we create the vector splat with
1745   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1746   //        handle a constant vector splat.
1747   Value *SplatVF =
1748       isa<Constant>(Mul)
1749           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1750           : Builder.CreateVectorSplat(VF, Mul);
1751   Builder.restoreIP(CurrIP);
1752 
1753   // We may need to add the step a number of times, depending on the unroll
1754   // factor. The last of those goes into the PHI.
1755   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1756                                     &*LoopVectorBody->getFirstInsertionPt());
1757   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1758   Instruction *LastInduction = VecInd;
1759   for (unsigned Part = 0; Part < UF; ++Part) {
1760     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1761 
1762     if (isa<TruncInst>(EntryVal))
1763       addMetadata(LastInduction, EntryVal);
1764     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1765 
1766     LastInduction = cast<Instruction>(addFastMathFlag(
1767         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1768     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1769   }
1770 
1771   // Move the last step to the end of the latch block. This ensures consistent
1772   // placement of all induction updates.
1773   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1774   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1775   auto *ICmp = cast<Instruction>(Br->getCondition());
1776   LastInduction->moveBefore(ICmp);
1777   LastInduction->setName("vec.ind.next");
1778 
1779   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1780   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1781 }
1782 
1783 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1784   return Cost->isScalarAfterVectorization(I, VF) ||
1785          Cost->isProfitableToScalarize(I, VF);
1786 }
1787 
1788 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1789   if (shouldScalarizeInstruction(IV))
1790     return true;
1791   auto isScalarInst = [&](User *U) -> bool {
1792     auto *I = cast<Instruction>(U);
1793     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1794   };
1795   return llvm::any_of(IV->users(), isScalarInst);
1796 }
1797 
1798 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1799     const InductionDescriptor &ID, const Instruction *EntryVal,
1800     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1801   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1802          "Expected either an induction phi-node or a truncate of it!");
1803 
1804   // This induction variable is not the phi from the original loop but the
1805   // newly-created IV based on the proof that casted Phi is equal to the
1806   // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1807   // re-uses the same InductionDescriptor that original IV uses but we don't
1808   // have to do any recording in this case - that is done when original IV is
1809   // processed.
1810   if (isa<TruncInst>(EntryVal))
1811     return;
1812 
1813   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1814   if (Casts.empty())
1815     return;
1816   // Only the first Cast instruction in the Casts vector is of interest.
1817   // The rest of the Casts (if exist) have no uses outside the
1818   // induction update chain itself.
1819   Instruction *CastInst = *Casts.begin();
1820   if (Lane < UINT_MAX)
1821     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1822   else
1823     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1824 }
1825 
1826 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1827   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1828          "Primary induction variable must have an integer type");
1829 
1830   auto II = Legal->getInductionVars().find(IV);
1831   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1832 
1833   auto ID = II->second;
1834   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1835 
1836   // The value from the original loop to which we are mapping the new induction
1837   // variable.
1838   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1839 
1840   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1841 
1842   // Generate code for the induction step. Note that induction steps are
1843   // required to be loop-invariant
1844   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1845     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1846            "Induction step should be loop invariant");
1847     if (PSE.getSE()->isSCEVable(IV->getType())) {
1848       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1849       return Exp.expandCodeFor(Step, Step->getType(),
1850                                LoopVectorPreHeader->getTerminator());
1851     }
1852     return cast<SCEVUnknown>(Step)->getValue();
1853   };
1854 
1855   // The scalar value to broadcast. This is derived from the canonical
1856   // induction variable. If a truncation type is given, truncate the canonical
1857   // induction variable and step. Otherwise, derive these values from the
1858   // induction descriptor.
1859   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1860     Value *ScalarIV = Induction;
1861     if (IV != OldInduction) {
1862       ScalarIV = IV->getType()->isIntegerTy()
1863                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1864                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1865                                           IV->getType());
1866       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1867       ScalarIV->setName("offset.idx");
1868     }
1869     if (Trunc) {
1870       auto *TruncType = cast<IntegerType>(Trunc->getType());
1871       assert(Step->getType()->isIntegerTy() &&
1872              "Truncation requires an integer step");
1873       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1874       Step = Builder.CreateTrunc(Step, TruncType);
1875     }
1876     return ScalarIV;
1877   };
1878 
1879   // Create the vector values from the scalar IV, in the absence of creating a
1880   // vector IV.
1881   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1882     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1883     for (unsigned Part = 0; Part < UF; ++Part) {
1884       Value *EntryPart =
1885           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1886       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1887       if (Trunc)
1888         addMetadata(EntryPart, Trunc);
1889       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1890     }
1891   };
1892 
1893   // Now do the actual transformations, and start with creating the step value.
1894   Value *Step = CreateStepValue(ID.getStep());
1895   if (VF <= 1) {
1896     Value *ScalarIV = CreateScalarIV(Step);
1897     CreateSplatIV(ScalarIV, Step);
1898     return;
1899   }
1900 
1901   // Determine if we want a scalar version of the induction variable. This is
1902   // true if the induction variable itself is not widened, or if it has at
1903   // least one user in the loop that is not widened.
1904   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1905   if (!NeedsScalarIV) {
1906     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1907     return;
1908   }
1909 
1910   // Try to create a new independent vector induction variable. If we can't
1911   // create the phi node, we will splat the scalar induction variable in each
1912   // loop iteration.
1913   if (!shouldScalarizeInstruction(EntryVal)) {
1914     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1915     Value *ScalarIV = CreateScalarIV(Step);
1916     // Create scalar steps that can be used by instructions we will later
1917     // scalarize. Note that the addition of the scalar steps will not increase
1918     // the number of instructions in the loop in the common case prior to
1919     // InstCombine. We will be trading one vector extract for each scalar step.
1920     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1921     return;
1922   }
1923 
1924   // All IV users are scalar instructions, so only emit a scalar IV, not a
1925   // vectorised IV. Except when we tail-fold, then the splat IV feeds the
1926   // predicate used by the masked loads/stores.
1927   Value *ScalarIV = CreateScalarIV(Step);
1928   if (!Cost->isScalarEpilogueAllowed())
1929     CreateSplatIV(ScalarIV, Step);
1930   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1931 }
1932 
1933 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1934                                           Instruction::BinaryOps BinOp) {
1935   // Create and check the types.
1936   auto *ValVTy = cast<VectorType>(Val->getType());
1937   int VLen = ValVTy->getNumElements();
1938 
1939   Type *STy = Val->getType()->getScalarType();
1940   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1941          "Induction Step must be an integer or FP");
1942   assert(Step->getType() == STy && "Step has wrong type");
1943 
1944   SmallVector<Constant *, 8> Indices;
1945 
1946   if (STy->isIntegerTy()) {
1947     // Create a vector of consecutive numbers from zero to VF.
1948     for (int i = 0; i < VLen; ++i)
1949       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1950 
1951     // Add the consecutive indices to the vector value.
1952     Constant *Cv = ConstantVector::get(Indices);
1953     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1954     Step = Builder.CreateVectorSplat(VLen, Step);
1955     assert(Step->getType() == Val->getType() && "Invalid step vec");
1956     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1957     // which can be found from the original scalar operations.
1958     Step = Builder.CreateMul(Cv, Step);
1959     return Builder.CreateAdd(Val, Step, "induction");
1960   }
1961 
1962   // Floating point induction.
1963   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1964          "Binary Opcode should be specified for FP induction");
1965   // Create a vector of consecutive numbers from zero to VF.
1966   for (int i = 0; i < VLen; ++i)
1967     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1968 
1969   // Add the consecutive indices to the vector value.
1970   Constant *Cv = ConstantVector::get(Indices);
1971 
1972   Step = Builder.CreateVectorSplat(VLen, Step);
1973 
1974   // Floating point operations had to be 'fast' to enable the induction.
1975   FastMathFlags Flags;
1976   Flags.setFast();
1977 
1978   Value *MulOp = Builder.CreateFMul(Cv, Step);
1979   if (isa<Instruction>(MulOp))
1980     // Have to check, MulOp may be a constant
1981     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1982 
1983   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1984   if (isa<Instruction>(BOp))
1985     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1986   return BOp;
1987 }
1988 
1989 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1990                                            Instruction *EntryVal,
1991                                            const InductionDescriptor &ID) {
1992   // We shouldn't have to build scalar steps if we aren't vectorizing.
1993   assert(VF > 1 && "VF should be greater than one");
1994 
1995   // Get the value type and ensure it and the step have the same integer type.
1996   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1997   assert(ScalarIVTy == Step->getType() &&
1998          "Val and Step should have the same type");
1999 
2000   // We build scalar steps for both integer and floating-point induction
2001   // variables. Here, we determine the kind of arithmetic we will perform.
2002   Instruction::BinaryOps AddOp;
2003   Instruction::BinaryOps MulOp;
2004   if (ScalarIVTy->isIntegerTy()) {
2005     AddOp = Instruction::Add;
2006     MulOp = Instruction::Mul;
2007   } else {
2008     AddOp = ID.getInductionOpcode();
2009     MulOp = Instruction::FMul;
2010   }
2011 
2012   // Determine the number of scalars we need to generate for each unroll
2013   // iteration. If EntryVal is uniform, we only need to generate the first
2014   // lane. Otherwise, we generate all VF values.
2015   unsigned Lanes =
2016       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
2017                                                                          : VF;
2018   // Compute the scalar steps and save the results in VectorLoopValueMap.
2019   for (unsigned Part = 0; Part < UF; ++Part) {
2020     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2021       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2022       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2023       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2024       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2025       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2026     }
2027   }
2028 }
2029 
2030 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2031   assert(V != Induction && "The new induction variable should not be used.");
2032   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2033   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2034 
2035   // If we have a stride that is replaced by one, do it here. Defer this for
2036   // the VPlan-native path until we start running Legal checks in that path.
2037   if (!EnableVPlanNativePath && Legal->hasStride(V))
2038     V = ConstantInt::get(V->getType(), 1);
2039 
2040   // If we have a vector mapped to this value, return it.
2041   if (VectorLoopValueMap.hasVectorValue(V, Part))
2042     return VectorLoopValueMap.getVectorValue(V, Part);
2043 
2044   // If the value has not been vectorized, check if it has been scalarized
2045   // instead. If it has been scalarized, and we actually need the value in
2046   // vector form, we will construct the vector values on demand.
2047   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2048     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2049 
2050     // If we've scalarized a value, that value should be an instruction.
2051     auto *I = cast<Instruction>(V);
2052 
2053     // If we aren't vectorizing, we can just copy the scalar map values over to
2054     // the vector map.
2055     if (VF == 1) {
2056       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2057       return ScalarValue;
2058     }
2059 
2060     // Get the last scalar instruction we generated for V and Part. If the value
2061     // is known to be uniform after vectorization, this corresponds to lane zero
2062     // of the Part unroll iteration. Otherwise, the last instruction is the one
2063     // we created for the last vector lane of the Part unroll iteration.
2064     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2065     auto *LastInst = cast<Instruction>(
2066         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2067 
2068     // Set the insert point after the last scalarized instruction. This ensures
2069     // the insertelement sequence will directly follow the scalar definitions.
2070     auto OldIP = Builder.saveIP();
2071     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2072     Builder.SetInsertPoint(&*NewIP);
2073 
2074     // However, if we are vectorizing, we need to construct the vector values.
2075     // If the value is known to be uniform after vectorization, we can just
2076     // broadcast the scalar value corresponding to lane zero for each unroll
2077     // iteration. Otherwise, we construct the vector values using insertelement
2078     // instructions. Since the resulting vectors are stored in
2079     // VectorLoopValueMap, we will only generate the insertelements once.
2080     Value *VectorValue = nullptr;
2081     if (Cost->isUniformAfterVectorization(I, VF)) {
2082       VectorValue = getBroadcastInstrs(ScalarValue);
2083       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2084     } else {
2085       // Initialize packing with insertelements to start from undef.
2086       Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF));
2087       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2088       for (unsigned Lane = 0; Lane < VF; ++Lane)
2089         packScalarIntoVectorValue(V, {Part, Lane});
2090       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2091     }
2092     Builder.restoreIP(OldIP);
2093     return VectorValue;
2094   }
2095 
2096   // If this scalar is unknown, assume that it is a constant or that it is
2097   // loop invariant. Broadcast V and save the value for future uses.
2098   Value *B = getBroadcastInstrs(V);
2099   VectorLoopValueMap.setVectorValue(V, Part, B);
2100   return B;
2101 }
2102 
2103 Value *
2104 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2105                                             const VPIteration &Instance) {
2106   // If the value is not an instruction contained in the loop, it should
2107   // already be scalar.
2108   if (OrigLoop->isLoopInvariant(V))
2109     return V;
2110 
2111   assert(Instance.Lane > 0
2112              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2113              : true && "Uniform values only have lane zero");
2114 
2115   // If the value from the original loop has not been vectorized, it is
2116   // represented by UF x VF scalar values in the new loop. Return the requested
2117   // scalar value.
2118   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2119     return VectorLoopValueMap.getScalarValue(V, Instance);
2120 
2121   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2122   // for the given unroll part. If this entry is not a vector type (i.e., the
2123   // vectorization factor is one), there is no need to generate an
2124   // extractelement instruction.
2125   auto *U = getOrCreateVectorValue(V, Instance.Part);
2126   if (!U->getType()->isVectorTy()) {
2127     assert(VF == 1 && "Value not scalarized has non-vector type");
2128     return U;
2129   }
2130 
2131   // Otherwise, the value from the original loop has been vectorized and is
2132   // represented by UF vector values. Extract and return the requested scalar
2133   // value from the appropriate vector lane.
2134   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2135 }
2136 
2137 void InnerLoopVectorizer::packScalarIntoVectorValue(
2138     Value *V, const VPIteration &Instance) {
2139   assert(V != Induction && "The new induction variable should not be used.");
2140   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2141   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2142 
2143   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2144   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2145   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2146                                             Builder.getInt32(Instance.Lane));
2147   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2148 }
2149 
2150 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2151   assert(Vec->getType()->isVectorTy() && "Invalid type");
2152   SmallVector<int, 8> ShuffleMask;
2153   for (unsigned i = 0; i < VF; ++i)
2154     ShuffleMask.push_back(VF - i - 1);
2155 
2156   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2157                                      ShuffleMask, "reverse");
2158 }
2159 
2160 // Return whether we allow using masked interleave-groups (for dealing with
2161 // strided loads/stores that reside in predicated blocks, or for dealing
2162 // with gaps).
2163 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2164   // If an override option has been passed in for interleaved accesses, use it.
2165   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2166     return EnableMaskedInterleavedMemAccesses;
2167 
2168   return TTI.enableMaskedInterleavedAccessVectorization();
2169 }
2170 
2171 // Try to vectorize the interleave group that \p Instr belongs to.
2172 //
2173 // E.g. Translate following interleaved load group (factor = 3):
2174 //   for (i = 0; i < N; i+=3) {
2175 //     R = Pic[i];             // Member of index 0
2176 //     G = Pic[i+1];           // Member of index 1
2177 //     B = Pic[i+2];           // Member of index 2
2178 //     ... // do something to R, G, B
2179 //   }
2180 // To:
2181 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2182 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2183 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2184 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2185 //
2186 // Or translate following interleaved store group (factor = 3):
2187 //   for (i = 0; i < N; i+=3) {
2188 //     ... do something to R, G, B
2189 //     Pic[i]   = R;           // Member of index 0
2190 //     Pic[i+1] = G;           // Member of index 1
2191 //     Pic[i+2] = B;           // Member of index 2
2192 //   }
2193 // To:
2194 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2195 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2196 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2197 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2198 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2199 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2200     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2201     VPValue *Addr, VPValue *BlockInMask) {
2202   Instruction *Instr = Group->getInsertPos();
2203   const DataLayout &DL = Instr->getModule()->getDataLayout();
2204 
2205   // Prepare for the vector type of the interleaved load/store.
2206   Type *ScalarTy = getMemInstValueType(Instr);
2207   unsigned InterleaveFactor = Group->getFactor();
2208   auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF);
2209 
2210   // Prepare for the new pointers.
2211   SmallVector<Value *, 2> AddrParts;
2212   unsigned Index = Group->getIndex(Instr);
2213 
2214   // TODO: extend the masked interleaved-group support to reversed access.
2215   assert((!BlockInMask || !Group->isReverse()) &&
2216          "Reversed masked interleave-group not supported.");
2217 
2218   // If the group is reverse, adjust the index to refer to the last vector lane
2219   // instead of the first. We adjust the index from the first vector lane,
2220   // rather than directly getting the pointer for lane VF - 1, because the
2221   // pointer operand of the interleaved access is supposed to be uniform. For
2222   // uniform instructions, we're only required to generate a value for the
2223   // first vector lane in each unroll iteration.
2224   if (Group->isReverse())
2225     Index += (VF - 1) * Group->getFactor();
2226 
2227   for (unsigned Part = 0; Part < UF; Part++) {
2228     Value *AddrPart = State.get(Addr, {Part, 0});
2229     setDebugLocFromInst(Builder, AddrPart);
2230 
2231     // Notice current instruction could be any index. Need to adjust the address
2232     // to the member of index 0.
2233     //
2234     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2235     //       b = A[i];       // Member of index 0
2236     // Current pointer is pointed to A[i+1], adjust it to A[i].
2237     //
2238     // E.g.  A[i+1] = a;     // Member of index 1
2239     //       A[i]   = b;     // Member of index 0
2240     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2241     // Current pointer is pointed to A[i+2], adjust it to A[i].
2242 
2243     bool InBounds = false;
2244     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2245       InBounds = gep->isInBounds();
2246     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2247     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2248 
2249     // Cast to the vector pointer type.
2250     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2251     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2252     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2253   }
2254 
2255   setDebugLocFromInst(Builder, Instr);
2256   Value *UndefVec = UndefValue::get(VecTy);
2257 
2258   Value *MaskForGaps = nullptr;
2259   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2260     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2261     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2262   }
2263 
2264   // Vectorize the interleaved load group.
2265   if (isa<LoadInst>(Instr)) {
2266     // For each unroll part, create a wide load for the group.
2267     SmallVector<Value *, 2> NewLoads;
2268     for (unsigned Part = 0; Part < UF; Part++) {
2269       Instruction *NewLoad;
2270       if (BlockInMask || MaskForGaps) {
2271         assert(useMaskedInterleavedAccesses(*TTI) &&
2272                "masked interleaved groups are not allowed.");
2273         Value *GroupMask = MaskForGaps;
2274         if (BlockInMask) {
2275           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2276           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2277           Value *ShuffledMask = Builder.CreateShuffleVector(
2278               BlockInMaskPart, Undefs,
2279               createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");
2280           GroupMask = MaskForGaps
2281                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2282                                                 MaskForGaps)
2283                           : ShuffledMask;
2284         }
2285         NewLoad =
2286             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2287                                      GroupMask, UndefVec, "wide.masked.vec");
2288       }
2289       else
2290         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2291                                             Group->getAlign(), "wide.vec");
2292       Group->addMetadata(NewLoad);
2293       NewLoads.push_back(NewLoad);
2294     }
2295 
2296     // For each member in the group, shuffle out the appropriate data from the
2297     // wide loads.
2298     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2299       Instruction *Member = Group->getMember(I);
2300 
2301       // Skip the gaps in the group.
2302       if (!Member)
2303         continue;
2304 
2305       auto StrideMask = createStrideMask(I, InterleaveFactor, VF);
2306       for (unsigned Part = 0; Part < UF; Part++) {
2307         Value *StridedVec = Builder.CreateShuffleVector(
2308             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2309 
2310         // If this member has different type, cast the result type.
2311         if (Member->getType() != ScalarTy) {
2312           VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF);
2313           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2314         }
2315 
2316         if (Group->isReverse())
2317           StridedVec = reverseVector(StridedVec);
2318 
2319         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2320       }
2321     }
2322     return;
2323   }
2324 
2325   // The sub vector type for current instruction.
2326   auto *SubVT = FixedVectorType::get(ScalarTy, VF);
2327 
2328   // Vectorize the interleaved store group.
2329   for (unsigned Part = 0; Part < UF; Part++) {
2330     // Collect the stored vector from each member.
2331     SmallVector<Value *, 4> StoredVecs;
2332     for (unsigned i = 0; i < InterleaveFactor; i++) {
2333       // Interleaved store group doesn't allow a gap, so each index has a member
2334       Instruction *Member = Group->getMember(i);
2335       assert(Member && "Fail to get a member from an interleaved store group");
2336 
2337       Value *StoredVec = getOrCreateVectorValue(
2338           cast<StoreInst>(Member)->getValueOperand(), Part);
2339       if (Group->isReverse())
2340         StoredVec = reverseVector(StoredVec);
2341 
2342       // If this member has different type, cast it to a unified type.
2343 
2344       if (StoredVec->getType() != SubVT)
2345         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2346 
2347       StoredVecs.push_back(StoredVec);
2348     }
2349 
2350     // Concatenate all vectors into a wide vector.
2351     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2352 
2353     // Interleave the elements in the wide vector.
2354     Value *IVec = Builder.CreateShuffleVector(
2355         WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor),
2356         "interleaved.vec");
2357 
2358     Instruction *NewStoreInstr;
2359     if (BlockInMask) {
2360       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2361       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2362       Value *ShuffledMask = Builder.CreateShuffleVector(
2363           BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF),
2364           "interleaved.mask");
2365       NewStoreInstr = Builder.CreateMaskedStore(
2366           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2367     }
2368     else
2369       NewStoreInstr =
2370           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2371 
2372     Group->addMetadata(NewStoreInstr);
2373   }
2374 }
2375 
2376 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2377                                                      VPTransformState &State,
2378                                                      VPValue *Addr,
2379                                                      VPValue *StoredValue,
2380                                                      VPValue *BlockInMask) {
2381   // Attempt to issue a wide load.
2382   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2383   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2384 
2385   assert((LI || SI) && "Invalid Load/Store instruction");
2386   assert((!SI || StoredValue) && "No stored value provided for widened store");
2387   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2388 
2389   LoopVectorizationCostModel::InstWidening Decision =
2390       Cost->getWideningDecision(Instr, VF);
2391   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2392           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2393           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2394          "CM decision is not to widen the memory instruction");
2395 
2396   Type *ScalarDataTy = getMemInstValueType(Instr);
2397   auto *DataTy = FixedVectorType::get(ScalarDataTy, VF);
2398   const Align Alignment = getLoadStoreAlignment(Instr);
2399 
2400   // Determine if the pointer operand of the access is either consecutive or
2401   // reverse consecutive.
2402   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2403   bool ConsecutiveStride =
2404       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2405   bool CreateGatherScatter =
2406       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2407 
2408   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2409   // gather/scatter. Otherwise Decision should have been to Scalarize.
2410   assert((ConsecutiveStride || CreateGatherScatter) &&
2411          "The instruction should be scalarized");
2412   (void)ConsecutiveStride;
2413 
2414   VectorParts BlockInMaskParts(UF);
2415   bool isMaskRequired = BlockInMask;
2416   if (isMaskRequired)
2417     for (unsigned Part = 0; Part < UF; ++Part)
2418       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2419 
2420   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2421     // Calculate the pointer for the specific unroll-part.
2422     GetElementPtrInst *PartPtr = nullptr;
2423 
2424     bool InBounds = false;
2425     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2426       InBounds = gep->isInBounds();
2427 
2428     if (Reverse) {
2429       // If the address is consecutive but reversed, then the
2430       // wide store needs to start at the last vector element.
2431       PartPtr = cast<GetElementPtrInst>(
2432           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2433       PartPtr->setIsInBounds(InBounds);
2434       PartPtr = cast<GetElementPtrInst>(
2435           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2436       PartPtr->setIsInBounds(InBounds);
2437       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2438         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2439     } else {
2440       PartPtr = cast<GetElementPtrInst>(
2441           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2442       PartPtr->setIsInBounds(InBounds);
2443     }
2444 
2445     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2446     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2447   };
2448 
2449   // Handle Stores:
2450   if (SI) {
2451     setDebugLocFromInst(Builder, SI);
2452 
2453     for (unsigned Part = 0; Part < UF; ++Part) {
2454       Instruction *NewSI = nullptr;
2455       Value *StoredVal = State.get(StoredValue, Part);
2456       if (CreateGatherScatter) {
2457         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2458         Value *VectorGep = State.get(Addr, Part);
2459         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2460                                             MaskPart);
2461       } else {
2462         if (Reverse) {
2463           // If we store to reverse consecutive memory locations, then we need
2464           // to reverse the order of elements in the stored value.
2465           StoredVal = reverseVector(StoredVal);
2466           // We don't want to update the value in the map as it might be used in
2467           // another expression. So don't call resetVectorValue(StoredVal).
2468         }
2469         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2470         if (isMaskRequired)
2471           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2472                                             BlockInMaskParts[Part]);
2473         else
2474           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2475       }
2476       addMetadata(NewSI, SI);
2477     }
2478     return;
2479   }
2480 
2481   // Handle loads.
2482   assert(LI && "Must have a load instruction");
2483   setDebugLocFromInst(Builder, LI);
2484   for (unsigned Part = 0; Part < UF; ++Part) {
2485     Value *NewLI;
2486     if (CreateGatherScatter) {
2487       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2488       Value *VectorGep = State.get(Addr, Part);
2489       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2490                                          nullptr, "wide.masked.gather");
2491       addMetadata(NewLI, LI);
2492     } else {
2493       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2494       if (isMaskRequired)
2495         NewLI = Builder.CreateMaskedLoad(
2496             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2497             "wide.masked.load");
2498       else
2499         NewLI =
2500             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2501 
2502       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2503       addMetadata(NewLI, LI);
2504       if (Reverse)
2505         NewLI = reverseVector(NewLI);
2506     }
2507     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2508   }
2509 }
2510 
2511 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2512                                                const VPIteration &Instance,
2513                                                bool IfPredicateInstr,
2514                                                VPTransformState &State) {
2515   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2516 
2517   setDebugLocFromInst(Builder, Instr);
2518 
2519   // Does this instruction return a value ?
2520   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2521 
2522   Instruction *Cloned = Instr->clone();
2523   if (!IsVoidRetTy)
2524     Cloned->setName(Instr->getName() + ".cloned");
2525 
2526   // Replace the operands of the cloned instructions with their scalar
2527   // equivalents in the new loop.
2528   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2529     auto *NewOp = State.get(User.getOperand(op), Instance);
2530     Cloned->setOperand(op, NewOp);
2531   }
2532   addNewMetadata(Cloned, Instr);
2533 
2534   // Place the cloned scalar in the new loop.
2535   Builder.Insert(Cloned);
2536 
2537   // Add the cloned scalar to the scalar map entry.
2538   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2539 
2540   // If we just cloned a new assumption, add it the assumption cache.
2541   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2542     if (II->getIntrinsicID() == Intrinsic::assume)
2543       AC->registerAssumption(II);
2544 
2545   // End if-block.
2546   if (IfPredicateInstr)
2547     PredicatedInstructions.push_back(Cloned);
2548 }
2549 
2550 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2551                                                       Value *End, Value *Step,
2552                                                       Instruction *DL) {
2553   BasicBlock *Header = L->getHeader();
2554   BasicBlock *Latch = L->getLoopLatch();
2555   // As we're just creating this loop, it's possible no latch exists
2556   // yet. If so, use the header as this will be a single block loop.
2557   if (!Latch)
2558     Latch = Header;
2559 
2560   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2561   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2562   setDebugLocFromInst(Builder, OldInst);
2563   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2564 
2565   Builder.SetInsertPoint(Latch->getTerminator());
2566   setDebugLocFromInst(Builder, OldInst);
2567 
2568   // Create i+1 and fill the PHINode.
2569   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2570   Induction->addIncoming(Start, L->getLoopPreheader());
2571   Induction->addIncoming(Next, Latch);
2572   // Create the compare.
2573   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2574   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2575 
2576   // Now we have two terminators. Remove the old one from the block.
2577   Latch->getTerminator()->eraseFromParent();
2578 
2579   return Induction;
2580 }
2581 
2582 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2583   if (TripCount)
2584     return TripCount;
2585 
2586   assert(L && "Create Trip Count for null loop.");
2587   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2588   // Find the loop boundaries.
2589   ScalarEvolution *SE = PSE.getSE();
2590   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2591   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2592          "Invalid loop count");
2593 
2594   Type *IdxTy = Legal->getWidestInductionType();
2595   assert(IdxTy && "No type for induction");
2596 
2597   // The exit count might have the type of i64 while the phi is i32. This can
2598   // happen if we have an induction variable that is sign extended before the
2599   // compare. The only way that we get a backedge taken count is that the
2600   // induction variable was signed and as such will not overflow. In such a case
2601   // truncation is legal.
2602   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2603       IdxTy->getPrimitiveSizeInBits())
2604     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2605   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2606 
2607   // Get the total trip count from the count by adding 1.
2608   const SCEV *ExitCount = SE->getAddExpr(
2609       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2610 
2611   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2612 
2613   // Expand the trip count and place the new instructions in the preheader.
2614   // Notice that the pre-header does not change, only the loop body.
2615   SCEVExpander Exp(*SE, DL, "induction");
2616 
2617   // Count holds the overall loop count (N).
2618   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2619                                 L->getLoopPreheader()->getTerminator());
2620 
2621   if (TripCount->getType()->isPointerTy())
2622     TripCount =
2623         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2624                                     L->getLoopPreheader()->getTerminator());
2625 
2626   return TripCount;
2627 }
2628 
2629 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2630   if (VectorTripCount)
2631     return VectorTripCount;
2632 
2633   Value *TC = getOrCreateTripCount(L);
2634   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2635 
2636   Type *Ty = TC->getType();
2637   Constant *Step = ConstantInt::get(Ty, VF * UF);
2638 
2639   // If the tail is to be folded by masking, round the number of iterations N
2640   // up to a multiple of Step instead of rounding down. This is done by first
2641   // adding Step-1 and then rounding down. Note that it's ok if this addition
2642   // overflows: the vector induction variable will eventually wrap to zero given
2643   // that it starts at zero and its Step is a power of two; the loop will then
2644   // exit, with the last early-exit vector comparison also producing all-true.
2645   if (Cost->foldTailByMasking()) {
2646     assert(isPowerOf2_32(VF * UF) &&
2647            "VF*UF must be a power of 2 when folding tail by masking");
2648     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2649   }
2650 
2651   // Now we need to generate the expression for the part of the loop that the
2652   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2653   // iterations are not required for correctness, or N - Step, otherwise. Step
2654   // is equal to the vectorization factor (number of SIMD elements) times the
2655   // unroll factor (number of SIMD instructions).
2656   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2657 
2658   // If there is a non-reversed interleaved group that may speculatively access
2659   // memory out-of-bounds, we need to ensure that there will be at least one
2660   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2661   // the trip count, we set the remainder to be equal to the step. If the step
2662   // does not evenly divide the trip count, no adjustment is necessary since
2663   // there will already be scalar iterations. Note that the minimum iterations
2664   // check ensures that N >= Step.
2665   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2666     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2667     R = Builder.CreateSelect(IsZero, Step, R);
2668   }
2669 
2670   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2671 
2672   return VectorTripCount;
2673 }
2674 
2675 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2676                                                    const DataLayout &DL) {
2677   // Verify that V is a vector type with same number of elements as DstVTy.
2678   unsigned VF = DstVTy->getNumElements();
2679   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2680   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2681   Type *SrcElemTy = SrcVecTy->getElementType();
2682   Type *DstElemTy = DstVTy->getElementType();
2683   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2684          "Vector elements must have same size");
2685 
2686   // Do a direct cast if element types are castable.
2687   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2688     return Builder.CreateBitOrPointerCast(V, DstVTy);
2689   }
2690   // V cannot be directly casted to desired vector type.
2691   // May happen when V is a floating point vector but DstVTy is a vector of
2692   // pointers or vice-versa. Handle this using a two-step bitcast using an
2693   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2694   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2695          "Only one type should be a pointer type");
2696   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2697          "Only one type should be a floating point type");
2698   Type *IntTy =
2699       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2700   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2701   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2702   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2703 }
2704 
2705 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2706                                                          BasicBlock *Bypass) {
2707   Value *Count = getOrCreateTripCount(L);
2708   // Reuse existing vector loop preheader for TC checks.
2709   // Note that new preheader block is generated for vector loop.
2710   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2711   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2712 
2713   // Generate code to check if the loop's trip count is less than VF * UF, or
2714   // equal to it in case a scalar epilogue is required; this implies that the
2715   // vector trip count is zero. This check also covers the case where adding one
2716   // to the backedge-taken count overflowed leading to an incorrect trip count
2717   // of zero. In this case we will also jump to the scalar loop.
2718   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2719                                           : ICmpInst::ICMP_ULT;
2720 
2721   // If tail is to be folded, vector loop takes care of all iterations.
2722   Value *CheckMinIters = Builder.getFalse();
2723   if (!Cost->foldTailByMasking())
2724     CheckMinIters = Builder.CreateICmp(
2725         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2726         "min.iters.check");
2727 
2728   // Create new preheader for vector loop.
2729   LoopVectorPreHeader =
2730       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2731                  "vector.ph");
2732 
2733   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2734                                DT->getNode(Bypass)->getIDom()) &&
2735          "TC check is expected to dominate Bypass");
2736 
2737   // Update dominator for Bypass & LoopExit.
2738   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2739   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2740 
2741   ReplaceInstWithInst(
2742       TCCheckBlock->getTerminator(),
2743       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2744   LoopBypassBlocks.push_back(TCCheckBlock);
2745 }
2746 
2747 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2748   // Reuse existing vector loop preheader for SCEV checks.
2749   // Note that new preheader block is generated for vector loop.
2750   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2751 
2752   // Generate the code to check that the SCEV assumptions that we made.
2753   // We want the new basic block to start at the first instruction in a
2754   // sequence of instructions that form a check.
2755   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2756                    "scev.check");
2757   Value *SCEVCheck = Exp.expandCodeForPredicate(
2758       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2759 
2760   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2761     if (C->isZero())
2762       return;
2763 
2764   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2765            llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
2766                                        PGSOQueryType::IRPass)) &&
2767          "Cannot SCEV check stride or overflow when optimizing for size");
2768 
2769   SCEVCheckBlock->setName("vector.scevcheck");
2770   // Create new preheader for vector loop.
2771   LoopVectorPreHeader =
2772       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2773                  nullptr, "vector.ph");
2774 
2775   // Update dominator only if this is first RT check.
2776   if (LoopBypassBlocks.empty()) {
2777     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2778     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2779   }
2780 
2781   ReplaceInstWithInst(
2782       SCEVCheckBlock->getTerminator(),
2783       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2784   LoopBypassBlocks.push_back(SCEVCheckBlock);
2785   AddedSafetyChecks = true;
2786 }
2787 
2788 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2789   // VPlan-native path does not do any analysis for runtime checks currently.
2790   if (EnableVPlanNativePath)
2791     return;
2792 
2793   // Reuse existing vector loop preheader for runtime memory checks.
2794   // Note that new preheader block is generated for vector loop.
2795   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2796 
2797   // Generate the code that checks in runtime if arrays overlap. We put the
2798   // checks into a separate block to make the more common case of few elements
2799   // faster.
2800   auto *LAI = Legal->getLAI();
2801   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2802   if (!RtPtrChecking.Need)
2803     return;
2804   Instruction *FirstCheckInst;
2805   Instruction *MemRuntimeCheck;
2806   std::tie(FirstCheckInst, MemRuntimeCheck) =
2807       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2808                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2809   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2810                             "claimed checks are required");
2811 
2812   if (MemCheckBlock->getParent()->hasOptSize() ||
2813       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
2814                                   PGSOQueryType::IRPass)) {
2815     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2816            "Cannot emit memory checks when optimizing for size, unless forced "
2817            "to vectorize.");
2818     ORE->emit([&]() {
2819       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2820                                         L->getStartLoc(), L->getHeader())
2821              << "Code-size may be reduced by not forcing "
2822                 "vectorization, or by source-code modifications "
2823                 "eliminating the need for runtime checks "
2824                 "(e.g., adding 'restrict').";
2825     });
2826   }
2827 
2828   MemCheckBlock->setName("vector.memcheck");
2829   // Create new preheader for vector loop.
2830   LoopVectorPreHeader =
2831       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2832                  "vector.ph");
2833 
2834   // Update dominator only if this is first RT check.
2835   if (LoopBypassBlocks.empty()) {
2836     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2837     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2838   }
2839 
2840   ReplaceInstWithInst(
2841       MemCheckBlock->getTerminator(),
2842       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2843   LoopBypassBlocks.push_back(MemCheckBlock);
2844   AddedSafetyChecks = true;
2845 
2846   // We currently don't use LoopVersioning for the actual loop cloning but we
2847   // still use it to add the noalias metadata.
2848   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2849                                           PSE.getSE());
2850   LVer->prepareNoAliasMetadata();
2851 }
2852 
2853 Value *InnerLoopVectorizer::emitTransformedIndex(
2854     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2855     const InductionDescriptor &ID) const {
2856 
2857   SCEVExpander Exp(*SE, DL, "induction");
2858   auto Step = ID.getStep();
2859   auto StartValue = ID.getStartValue();
2860   assert(Index->getType() == Step->getType() &&
2861          "Index type does not match StepValue type");
2862 
2863   // Note: the IR at this point is broken. We cannot use SE to create any new
2864   // SCEV and then expand it, hoping that SCEV's simplification will give us
2865   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2866   // lead to various SCEV crashes. So all we can do is to use builder and rely
2867   // on InstCombine for future simplifications. Here we handle some trivial
2868   // cases only.
2869   auto CreateAdd = [&B](Value *X, Value *Y) {
2870     assert(X->getType() == Y->getType() && "Types don't match!");
2871     if (auto *CX = dyn_cast<ConstantInt>(X))
2872       if (CX->isZero())
2873         return Y;
2874     if (auto *CY = dyn_cast<ConstantInt>(Y))
2875       if (CY->isZero())
2876         return X;
2877     return B.CreateAdd(X, Y);
2878   };
2879 
2880   auto CreateMul = [&B](Value *X, Value *Y) {
2881     assert(X->getType() == Y->getType() && "Types don't match!");
2882     if (auto *CX = dyn_cast<ConstantInt>(X))
2883       if (CX->isOne())
2884         return Y;
2885     if (auto *CY = dyn_cast<ConstantInt>(Y))
2886       if (CY->isOne())
2887         return X;
2888     return B.CreateMul(X, Y);
2889   };
2890 
2891   // Get a suitable insert point for SCEV expansion. For blocks in the vector
2892   // loop, choose the end of the vector loop header (=LoopVectorBody), because
2893   // the DomTree is not kept up-to-date for additional blocks generated in the
2894   // vector loop. By using the header as insertion point, we guarantee that the
2895   // expanded instructions dominate all their uses.
2896   auto GetInsertPoint = [this, &B]() {
2897     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
2898     if (InsertBB != LoopVectorBody &&
2899         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
2900       return LoopVectorBody->getTerminator();
2901     return &*B.GetInsertPoint();
2902   };
2903   switch (ID.getKind()) {
2904   case InductionDescriptor::IK_IntInduction: {
2905     assert(Index->getType() == StartValue->getType() &&
2906            "Index type does not match StartValue type");
2907     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2908       return B.CreateSub(StartValue, Index);
2909     auto *Offset = CreateMul(
2910         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
2911     return CreateAdd(StartValue, Offset);
2912   }
2913   case InductionDescriptor::IK_PtrInduction: {
2914     assert(isa<SCEVConstant>(Step) &&
2915            "Expected constant step for pointer induction");
2916     return B.CreateGEP(
2917         StartValue->getType()->getPointerElementType(), StartValue,
2918         CreateMul(Index,
2919                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
2920   }
2921   case InductionDescriptor::IK_FpInduction: {
2922     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2923     auto InductionBinOp = ID.getInductionBinOp();
2924     assert(InductionBinOp &&
2925            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2926             InductionBinOp->getOpcode() == Instruction::FSub) &&
2927            "Original bin op should be defined for FP induction");
2928 
2929     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2930 
2931     // Floating point operations had to be 'fast' to enable the induction.
2932     FastMathFlags Flags;
2933     Flags.setFast();
2934 
2935     Value *MulExp = B.CreateFMul(StepValue, Index);
2936     if (isa<Instruction>(MulExp))
2937       // We have to check, the MulExp may be a constant.
2938       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2939 
2940     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2941                                "induction");
2942     if (isa<Instruction>(BOp))
2943       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2944 
2945     return BOp;
2946   }
2947   case InductionDescriptor::IK_NoInduction:
2948     return nullptr;
2949   }
2950   llvm_unreachable("invalid enum");
2951 }
2952 
2953 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2954   /*
2955    In this function we generate a new loop. The new loop will contain
2956    the vectorized instructions while the old loop will continue to run the
2957    scalar remainder.
2958 
2959        [ ] <-- loop iteration number check.
2960     /   |
2961    /    v
2962   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2963   |  /  |
2964   | /   v
2965   ||   [ ]     <-- vector pre header.
2966   |/    |
2967   |     v
2968   |    [  ] \
2969   |    [  ]_|   <-- vector loop.
2970   |     |
2971   |     v
2972   |   -[ ]   <--- middle-block.
2973   |  /  |
2974   | /   v
2975   -|- >[ ]     <--- new preheader.
2976    |    |
2977    |    v
2978    |   [ ] \
2979    |   [ ]_|   <-- old scalar loop to handle remainder.
2980     \   |
2981      \  v
2982       >[ ]     <-- exit block.
2983    ...
2984    */
2985 
2986   MDNode *OrigLoopID = OrigLoop->getLoopID();
2987 
2988   // Some loops have a single integer induction variable, while other loops
2989   // don't. One example is c++ iterators that often have multiple pointer
2990   // induction variables. In the code below we also support a case where we
2991   // don't have a single induction variable.
2992   //
2993   // We try to obtain an induction variable from the original loop as hard
2994   // as possible. However if we don't find one that:
2995   //   - is an integer
2996   //   - counts from zero, stepping by one
2997   //   - is the size of the widest induction variable type
2998   // then we create a new one.
2999   OldInduction = Legal->getPrimaryInduction();
3000   Type *IdxTy = Legal->getWidestInductionType();
3001 
3002   // Split the single block loop into the two loop structure described above.
3003   LoopScalarBody = OrigLoop->getHeader();
3004   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3005   LoopExitBlock = OrigLoop->getExitBlock();
3006   assert(LoopExitBlock && "Must have an exit block");
3007   assert(LoopVectorPreHeader && "Invalid loop structure");
3008 
3009   LoopMiddleBlock =
3010       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3011                  LI, nullptr, "middle.block");
3012   LoopScalarPreHeader =
3013       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3014                  nullptr, "scalar.ph");
3015   // We intentionally don't let SplitBlock to update LoopInfo since
3016   // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
3017   // LoopVectorBody is explicitly added to the correct place few lines later.
3018   LoopVectorBody =
3019       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3020                  nullptr, nullptr, "vector.body");
3021 
3022   // Update dominator for loop exit.
3023   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3024 
3025   // Create and register the new vector loop.
3026   Loop *Lp = LI->AllocateLoop();
3027   Loop *ParentLoop = OrigLoop->getParentLoop();
3028 
3029   // Insert the new loop into the loop nest and register the new basic blocks
3030   // before calling any utilities such as SCEV that require valid LoopInfo.
3031   if (ParentLoop) {
3032     ParentLoop->addChildLoop(Lp);
3033   } else {
3034     LI->addTopLevelLoop(Lp);
3035   }
3036   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3037 
3038   // Find the loop boundaries.
3039   Value *Count = getOrCreateTripCount(Lp);
3040 
3041   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3042 
3043   // Now, compare the new count to zero. If it is zero skip the vector loop and
3044   // jump to the scalar loop. This check also covers the case where the
3045   // backedge-taken count is uint##_max: adding one to it will overflow leading
3046   // to an incorrect trip count of zero. In this (rare) case we will also jump
3047   // to the scalar loop.
3048   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3049 
3050   // Generate the code to check any assumptions that we've made for SCEV
3051   // expressions.
3052   emitSCEVChecks(Lp, LoopScalarPreHeader);
3053 
3054   // Generate the code that checks in runtime if arrays overlap. We put the
3055   // checks into a separate block to make the more common case of few elements
3056   // faster.
3057   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3058 
3059   // Generate the induction variable.
3060   // The loop step is equal to the vectorization factor (num of SIMD elements)
3061   // times the unroll factor (num of SIMD instructions).
3062   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3063   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3064   Induction =
3065       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3066                               getDebugLocFromInstOrOperands(OldInduction));
3067 
3068   // We are going to resume the execution of the scalar loop.
3069   // Go over all of the induction variables that we found and fix the
3070   // PHIs that are left in the scalar version of the loop.
3071   // The starting values of PHI nodes depend on the counter of the last
3072   // iteration in the vectorized loop.
3073   // If we come from a bypass edge then we need to start from the original
3074   // start value.
3075 
3076   // This variable saves the new starting index for the scalar loop. It is used
3077   // to test if there are any tail iterations left once the vector loop has
3078   // completed.
3079   for (auto &InductionEntry : Legal->getInductionVars()) {
3080     PHINode *OrigPhi = InductionEntry.first;
3081     InductionDescriptor II = InductionEntry.second;
3082 
3083     // Create phi nodes to merge from the  backedge-taken check block.
3084     PHINode *BCResumeVal =
3085         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3086                         LoopScalarPreHeader->getTerminator());
3087     // Copy original phi DL over to the new one.
3088     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3089     Value *&EndValue = IVEndValues[OrigPhi];
3090     if (OrigPhi == OldInduction) {
3091       // We know what the end value is.
3092       EndValue = CountRoundDown;
3093     } else {
3094       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3095       Type *StepType = II.getStep()->getType();
3096       Instruction::CastOps CastOp =
3097           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3098       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3099       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3100       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3101       EndValue->setName("ind.end");
3102     }
3103 
3104     // The new PHI merges the original incoming value, in case of a bypass,
3105     // or the value at the end of the vectorized loop.
3106     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3107 
3108     // Fix the scalar body counter (PHI node).
3109     // The old induction's phi node in the scalar body needs the truncated
3110     // value.
3111     for (BasicBlock *BB : LoopBypassBlocks)
3112       BCResumeVal->addIncoming(II.getStartValue(), BB);
3113     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3114   }
3115 
3116   // We need the OrigLoop (scalar loop part) latch terminator to help
3117   // produce correct debug info for the middle block BB instructions.
3118   // The legality check stage guarantees that the loop will have a single
3119   // latch.
3120   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3121          "Scalar loop latch terminator isn't a branch");
3122   BranchInst *ScalarLatchBr =
3123       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3124 
3125   // Add a check in the middle block to see if we have completed
3126   // all of the iterations in the first vector loop.
3127   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3128   // If tail is to be folded, we know we don't need to run the remainder.
3129   Value *CmpN = Builder.getTrue();
3130   if (!Cost->foldTailByMasking()) {
3131     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3132                            CountRoundDown, "cmp.n",
3133                            LoopMiddleBlock->getTerminator());
3134 
3135     // Here we use the same DebugLoc as the scalar loop latch branch instead
3136     // of the corresponding compare because they may have ended up with
3137     // different line numbers and we want to avoid awkward line stepping while
3138     // debugging. Eg. if the compare has got a line number inside the loop.
3139     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3140   }
3141 
3142   BranchInst *BrInst =
3143       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3144   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3145   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3146 
3147   // Get ready to start creating new instructions into the vectorized body.
3148   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3149          "Inconsistent vector loop preheader");
3150   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3151 
3152   Optional<MDNode *> VectorizedLoopID =
3153       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3154                                       LLVMLoopVectorizeFollowupVectorized});
3155   if (VectorizedLoopID.hasValue()) {
3156     Lp->setLoopID(VectorizedLoopID.getValue());
3157 
3158     // Do not setAlreadyVectorized if loop attributes have been defined
3159     // explicitly.
3160     return LoopVectorPreHeader;
3161   }
3162 
3163   // Keep all loop hints from the original loop on the vector loop (we'll
3164   // replace the vectorizer-specific hints below).
3165   if (MDNode *LID = OrigLoop->getLoopID())
3166     Lp->setLoopID(LID);
3167 
3168   LoopVectorizeHints Hints(Lp, true, *ORE);
3169   Hints.setAlreadyVectorized();
3170 
3171 #ifdef EXPENSIVE_CHECKS
3172   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3173   LI->verify(*DT);
3174 #endif
3175 
3176   return LoopVectorPreHeader;
3177 }
3178 
3179 // Fix up external users of the induction variable. At this point, we are
3180 // in LCSSA form, with all external PHIs that use the IV having one input value,
3181 // coming from the remainder loop. We need those PHIs to also have a correct
3182 // value for the IV when arriving directly from the middle block.
3183 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3184                                        const InductionDescriptor &II,
3185                                        Value *CountRoundDown, Value *EndValue,
3186                                        BasicBlock *MiddleBlock) {
3187   // There are two kinds of external IV usages - those that use the value
3188   // computed in the last iteration (the PHI) and those that use the penultimate
3189   // value (the value that feeds into the phi from the loop latch).
3190   // We allow both, but they, obviously, have different values.
3191 
3192   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3193 
3194   DenseMap<Value *, Value *> MissingVals;
3195 
3196   // An external user of the last iteration's value should see the value that
3197   // the remainder loop uses to initialize its own IV.
3198   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3199   for (User *U : PostInc->users()) {
3200     Instruction *UI = cast<Instruction>(U);
3201     if (!OrigLoop->contains(UI)) {
3202       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3203       MissingVals[UI] = EndValue;
3204     }
3205   }
3206 
3207   // An external user of the penultimate value need to see EndValue - Step.
3208   // The simplest way to get this is to recompute it from the constituent SCEVs,
3209   // that is Start + (Step * (CRD - 1)).
3210   for (User *U : OrigPhi->users()) {
3211     auto *UI = cast<Instruction>(U);
3212     if (!OrigLoop->contains(UI)) {
3213       const DataLayout &DL =
3214           OrigLoop->getHeader()->getModule()->getDataLayout();
3215       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3216 
3217       IRBuilder<> B(MiddleBlock->getTerminator());
3218       Value *CountMinusOne = B.CreateSub(
3219           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3220       Value *CMO =
3221           !II.getStep()->getType()->isIntegerTy()
3222               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3223                              II.getStep()->getType())
3224               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3225       CMO->setName("cast.cmo");
3226       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3227       Escape->setName("ind.escape");
3228       MissingVals[UI] = Escape;
3229     }
3230   }
3231 
3232   for (auto &I : MissingVals) {
3233     PHINode *PHI = cast<PHINode>(I.first);
3234     // One corner case we have to handle is two IVs "chasing" each-other,
3235     // that is %IV2 = phi [...], [ %IV1, %latch ]
3236     // In this case, if IV1 has an external use, we need to avoid adding both
3237     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3238     // don't already have an incoming value for the middle block.
3239     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3240       PHI->addIncoming(I.second, MiddleBlock);
3241   }
3242 }
3243 
3244 namespace {
3245 
3246 struct CSEDenseMapInfo {
3247   static bool canHandle(const Instruction *I) {
3248     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3249            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3250   }
3251 
3252   static inline Instruction *getEmptyKey() {
3253     return DenseMapInfo<Instruction *>::getEmptyKey();
3254   }
3255 
3256   static inline Instruction *getTombstoneKey() {
3257     return DenseMapInfo<Instruction *>::getTombstoneKey();
3258   }
3259 
3260   static unsigned getHashValue(const Instruction *I) {
3261     assert(canHandle(I) && "Unknown instruction!");
3262     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3263                                                            I->value_op_end()));
3264   }
3265 
3266   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3267     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3268         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3269       return LHS == RHS;
3270     return LHS->isIdenticalTo(RHS);
3271   }
3272 };
3273 
3274 } // end anonymous namespace
3275 
3276 ///Perform cse of induction variable instructions.
3277 static void cse(BasicBlock *BB) {
3278   // Perform simple cse.
3279   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3280   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3281     Instruction *In = &*I++;
3282 
3283     if (!CSEDenseMapInfo::canHandle(In))
3284       continue;
3285 
3286     // Check if we can replace this instruction with any of the
3287     // visited instructions.
3288     if (Instruction *V = CSEMap.lookup(In)) {
3289       In->replaceAllUsesWith(V);
3290       In->eraseFromParent();
3291       continue;
3292     }
3293 
3294     CSEMap[In] = In;
3295   }
3296 }
3297 
3298 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3299                                                        unsigned VF,
3300                                                        bool &NeedToScalarize) {
3301   Function *F = CI->getCalledFunction();
3302   Type *ScalarRetTy = CI->getType();
3303   SmallVector<Type *, 4> Tys, ScalarTys;
3304   for (auto &ArgOp : CI->arg_operands())
3305     ScalarTys.push_back(ArgOp->getType());
3306 
3307   // Estimate cost of scalarized vector call. The source operands are assumed
3308   // to be vectors, so we need to extract individual elements from there,
3309   // execute VF scalar calls, and then gather the result into the vector return
3310   // value.
3311   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3312                                                  TTI::TCK_RecipThroughput);
3313   if (VF == 1)
3314     return ScalarCallCost;
3315 
3316   // Compute corresponding vector type for return value and arguments.
3317   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3318   for (Type *ScalarTy : ScalarTys)
3319     Tys.push_back(ToVectorTy(ScalarTy, VF));
3320 
3321   // Compute costs of unpacking argument values for the scalar calls and
3322   // packing the return values to a vector.
3323   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3324 
3325   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3326 
3327   // If we can't emit a vector call for this function, then the currently found
3328   // cost is the cost we need to return.
3329   NeedToScalarize = true;
3330   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3331   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3332 
3333   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3334     return Cost;
3335 
3336   // If the corresponding vector cost is cheaper, return its cost.
3337   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3338                                                  TTI::TCK_RecipThroughput);
3339   if (VectorCallCost < Cost) {
3340     NeedToScalarize = false;
3341     return VectorCallCost;
3342   }
3343   return Cost;
3344 }
3345 
3346 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3347                                                             unsigned VF) {
3348   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3349   assert(ID && "Expected intrinsic call!");
3350 
3351   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3352   return TTI.getIntrinsicInstrCost(CostAttrs,
3353                                    TargetTransformInfo::TCK_RecipThroughput);
3354 }
3355 
3356 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3357   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3358   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3359   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3360 }
3361 
3362 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3363   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3364   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3365   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3366 }
3367 
3368 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3369   // For every instruction `I` in MinBWs, truncate the operands, create a
3370   // truncated version of `I` and reextend its result. InstCombine runs
3371   // later and will remove any ext/trunc pairs.
3372   SmallPtrSet<Value *, 4> Erased;
3373   for (const auto &KV : Cost->getMinimalBitwidths()) {
3374     // If the value wasn't vectorized, we must maintain the original scalar
3375     // type. The absence of the value from VectorLoopValueMap indicates that it
3376     // wasn't vectorized.
3377     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3378       continue;
3379     for (unsigned Part = 0; Part < UF; ++Part) {
3380       Value *I = getOrCreateVectorValue(KV.first, Part);
3381       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3382         continue;
3383       Type *OriginalTy = I->getType();
3384       Type *ScalarTruncatedTy =
3385           IntegerType::get(OriginalTy->getContext(), KV.second);
3386       auto *TruncatedTy = FixedVectorType::get(
3387           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
3388       if (TruncatedTy == OriginalTy)
3389         continue;
3390 
3391       IRBuilder<> B(cast<Instruction>(I));
3392       auto ShrinkOperand = [&](Value *V) -> Value * {
3393         if (auto *ZI = dyn_cast<ZExtInst>(V))
3394           if (ZI->getSrcTy() == TruncatedTy)
3395             return ZI->getOperand(0);
3396         return B.CreateZExtOrTrunc(V, TruncatedTy);
3397       };
3398 
3399       // The actual instruction modification depends on the instruction type,
3400       // unfortunately.
3401       Value *NewI = nullptr;
3402       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3403         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3404                              ShrinkOperand(BO->getOperand(1)));
3405 
3406         // Any wrapping introduced by shrinking this operation shouldn't be
3407         // considered undefined behavior. So, we can't unconditionally copy
3408         // arithmetic wrapping flags to NewI.
3409         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3410       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3411         NewI =
3412             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3413                          ShrinkOperand(CI->getOperand(1)));
3414       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3415         NewI = B.CreateSelect(SI->getCondition(),
3416                               ShrinkOperand(SI->getTrueValue()),
3417                               ShrinkOperand(SI->getFalseValue()));
3418       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3419         switch (CI->getOpcode()) {
3420         default:
3421           llvm_unreachable("Unhandled cast!");
3422         case Instruction::Trunc:
3423           NewI = ShrinkOperand(CI->getOperand(0));
3424           break;
3425         case Instruction::SExt:
3426           NewI = B.CreateSExtOrTrunc(
3427               CI->getOperand(0),
3428               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3429           break;
3430         case Instruction::ZExt:
3431           NewI = B.CreateZExtOrTrunc(
3432               CI->getOperand(0),
3433               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3434           break;
3435         }
3436       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3437         auto Elements0 =
3438             cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
3439         auto *O0 = B.CreateZExtOrTrunc(
3440             SI->getOperand(0),
3441             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3442         auto Elements1 =
3443             cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
3444         auto *O1 = B.CreateZExtOrTrunc(
3445             SI->getOperand(1),
3446             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3447 
3448         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3449       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3450         // Don't do anything with the operands, just extend the result.
3451         continue;
3452       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3453         auto Elements =
3454             cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
3455         auto *O0 = B.CreateZExtOrTrunc(
3456             IE->getOperand(0),
3457             FixedVectorType::get(ScalarTruncatedTy, Elements));
3458         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3459         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3460       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3461         auto Elements =
3462             cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
3463         auto *O0 = B.CreateZExtOrTrunc(
3464             EE->getOperand(0),
3465             FixedVectorType::get(ScalarTruncatedTy, Elements));
3466         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3467       } else {
3468         // If we don't know what to do, be conservative and don't do anything.
3469         continue;
3470       }
3471 
3472       // Lastly, extend the result.
3473       NewI->takeName(cast<Instruction>(I));
3474       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3475       I->replaceAllUsesWith(Res);
3476       cast<Instruction>(I)->eraseFromParent();
3477       Erased.insert(I);
3478       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3479     }
3480   }
3481 
3482   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3483   for (const auto &KV : Cost->getMinimalBitwidths()) {
3484     // If the value wasn't vectorized, we must maintain the original scalar
3485     // type. The absence of the value from VectorLoopValueMap indicates that it
3486     // wasn't vectorized.
3487     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3488       continue;
3489     for (unsigned Part = 0; Part < UF; ++Part) {
3490       Value *I = getOrCreateVectorValue(KV.first, Part);
3491       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3492       if (Inst && Inst->use_empty()) {
3493         Value *NewI = Inst->getOperand(0);
3494         Inst->eraseFromParent();
3495         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3496       }
3497     }
3498   }
3499 }
3500 
3501 void InnerLoopVectorizer::fixVectorizedLoop() {
3502   // Insert truncates and extends for any truncated instructions as hints to
3503   // InstCombine.
3504   if (VF > 1)
3505     truncateToMinimalBitwidths();
3506 
3507   // Fix widened non-induction PHIs by setting up the PHI operands.
3508   if (OrigPHIsToFix.size()) {
3509     assert(EnableVPlanNativePath &&
3510            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3511     fixNonInductionPHIs();
3512   }
3513 
3514   // At this point every instruction in the original loop is widened to a
3515   // vector form. Now we need to fix the recurrences in the loop. These PHI
3516   // nodes are currently empty because we did not want to introduce cycles.
3517   // This is the second stage of vectorizing recurrences.
3518   fixCrossIterationPHIs();
3519 
3520   // Forget the original basic block.
3521   PSE.getSE()->forgetLoop(OrigLoop);
3522 
3523   // Fix-up external users of the induction variables.
3524   for (auto &Entry : Legal->getInductionVars())
3525     fixupIVUsers(Entry.first, Entry.second,
3526                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3527                  IVEndValues[Entry.first], LoopMiddleBlock);
3528 
3529   fixLCSSAPHIs();
3530   for (Instruction *PI : PredicatedInstructions)
3531     sinkScalarOperands(&*PI);
3532 
3533   // Remove redundant induction instructions.
3534   cse(LoopVectorBody);
3535 
3536   // Set/update profile weights for the vector and remainder loops as original
3537   // loop iterations are now distributed among them. Note that original loop
3538   // represented by LoopScalarBody becomes remainder loop after vectorization.
3539   //
3540   // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
3541   // end up getting slightly roughened result but that should be OK since
3542   // profile is not inherently precise anyway. Note also possible bypass of
3543   // vector code caused by legality checks is ignored, assigning all the weight
3544   // to the vector loop, optimistically.
3545   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3546                                LI->getLoopFor(LoopVectorBody),
3547                                LI->getLoopFor(LoopScalarBody), VF * UF);
3548 }
3549 
3550 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3551   // In order to support recurrences we need to be able to vectorize Phi nodes.
3552   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3553   // stage #2: We now need to fix the recurrences by adding incoming edges to
3554   // the currently empty PHI nodes. At this point every instruction in the
3555   // original loop is widened to a vector form so we can use them to construct
3556   // the incoming edges.
3557   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3558     // Handle first-order recurrences and reductions that need to be fixed.
3559     if (Legal->isFirstOrderRecurrence(&Phi))
3560       fixFirstOrderRecurrence(&Phi);
3561     else if (Legal->isReductionVariable(&Phi))
3562       fixReduction(&Phi);
3563   }
3564 }
3565 
3566 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3567   // This is the second phase of vectorizing first-order recurrences. An
3568   // overview of the transformation is described below. Suppose we have the
3569   // following loop.
3570   //
3571   //   for (int i = 0; i < n; ++i)
3572   //     b[i] = a[i] - a[i - 1];
3573   //
3574   // There is a first-order recurrence on "a". For this loop, the shorthand
3575   // scalar IR looks like:
3576   //
3577   //   scalar.ph:
3578   //     s_init = a[-1]
3579   //     br scalar.body
3580   //
3581   //   scalar.body:
3582   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3583   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3584   //     s2 = a[i]
3585   //     b[i] = s2 - s1
3586   //     br cond, scalar.body, ...
3587   //
3588   // In this example, s1 is a recurrence because it's value depends on the
3589   // previous iteration. In the first phase of vectorization, we created a
3590   // temporary value for s1. We now complete the vectorization and produce the
3591   // shorthand vector IR shown below (for VF = 4, UF = 1).
3592   //
3593   //   vector.ph:
3594   //     v_init = vector(..., ..., ..., a[-1])
3595   //     br vector.body
3596   //
3597   //   vector.body
3598   //     i = phi [0, vector.ph], [i+4, vector.body]
3599   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3600   //     v2 = a[i, i+1, i+2, i+3];
3601   //     v3 = vector(v1(3), v2(0, 1, 2))
3602   //     b[i, i+1, i+2, i+3] = v2 - v3
3603   //     br cond, vector.body, middle.block
3604   //
3605   //   middle.block:
3606   //     x = v2(3)
3607   //     br scalar.ph
3608   //
3609   //   scalar.ph:
3610   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3611   //     br scalar.body
3612   //
3613   // After execution completes the vector loop, we extract the next value of
3614   // the recurrence (x) to use as the initial value in the scalar loop.
3615 
3616   // Get the original loop preheader and single loop latch.
3617   auto *Preheader = OrigLoop->getLoopPreheader();
3618   auto *Latch = OrigLoop->getLoopLatch();
3619 
3620   // Get the initial and previous values of the scalar recurrence.
3621   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3622   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3623 
3624   // Create a vector from the initial value.
3625   auto *VectorInit = ScalarInit;
3626   if (VF > 1) {
3627     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3628     VectorInit = Builder.CreateInsertElement(
3629         UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)),
3630         VectorInit, Builder.getInt32(VF - 1), "vector.recur.init");
3631   }
3632 
3633   // We constructed a temporary phi node in the first phase of vectorization.
3634   // This phi node will eventually be deleted.
3635   Builder.SetInsertPoint(
3636       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3637 
3638   // Create a phi node for the new recurrence. The current value will either be
3639   // the initial value inserted into a vector or loop-varying vector value.
3640   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3641   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3642 
3643   // Get the vectorized previous value of the last part UF - 1. It appears last
3644   // among all unrolled iterations, due to the order of their construction.
3645   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3646 
3647   // Find and set the insertion point after the previous value if it is an
3648   // instruction.
3649   BasicBlock::iterator InsertPt;
3650   // Note that the previous value may have been constant-folded so it is not
3651   // guaranteed to be an instruction in the vector loop.
3652   // FIXME: Loop invariant values do not form recurrences. We should deal with
3653   //        them earlier.
3654   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3655     InsertPt = LoopVectorBody->getFirstInsertionPt();
3656   else {
3657     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3658     if (isa<PHINode>(PreviousLastPart))
3659       // If the previous value is a phi node, we should insert after all the phi
3660       // nodes in the block containing the PHI to avoid breaking basic block
3661       // verification. Note that the basic block may be different to
3662       // LoopVectorBody, in case we predicate the loop.
3663       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3664     else
3665       InsertPt = ++PreviousInst->getIterator();
3666   }
3667   Builder.SetInsertPoint(&*InsertPt);
3668 
3669   // We will construct a vector for the recurrence by combining the values for
3670   // the current and previous iterations. This is the required shuffle mask.
3671   SmallVector<int, 8> ShuffleMask(VF);
3672   ShuffleMask[0] = VF - 1;
3673   for (unsigned I = 1; I < VF; ++I)
3674     ShuffleMask[I] = I + VF - 1;
3675 
3676   // The vector from which to take the initial value for the current iteration
3677   // (actual or unrolled). Initially, this is the vector phi node.
3678   Value *Incoming = VecPhi;
3679 
3680   // Shuffle the current and previous vector and update the vector parts.
3681   for (unsigned Part = 0; Part < UF; ++Part) {
3682     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3683     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3684     auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3685                                                          ShuffleMask)
3686                            : Incoming;
3687     PhiPart->replaceAllUsesWith(Shuffle);
3688     cast<Instruction>(PhiPart)->eraseFromParent();
3689     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3690     Incoming = PreviousPart;
3691   }
3692 
3693   // Fix the latch value of the new recurrence in the vector loop.
3694   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3695 
3696   // Extract the last vector element in the middle block. This will be the
3697   // initial value for the recurrence when jumping to the scalar loop.
3698   auto *ExtractForScalar = Incoming;
3699   if (VF > 1) {
3700     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3701     ExtractForScalar = Builder.CreateExtractElement(
3702         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3703   }
3704   // Extract the second last element in the middle block if the
3705   // Phi is used outside the loop. We need to extract the phi itself
3706   // and not the last element (the phi update in the current iteration). This
3707   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3708   // when the scalar loop is not run at all.
3709   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3710   if (VF > 1)
3711     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3712         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3713   // When loop is unrolled without vectorizing, initialize
3714   // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
3715   // `Incoming`. This is analogous to the vectorized case above: extracting the
3716   // second last element when VF > 1.
3717   else if (UF > 1)
3718     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3719 
3720   // Fix the initial value of the original recurrence in the scalar loop.
3721   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3722   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3723   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3724     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3725     Start->addIncoming(Incoming, BB);
3726   }
3727 
3728   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3729   Phi->setName("scalar.recur");
3730 
3731   // Finally, fix users of the recurrence outside the loop. The users will need
3732   // either the last value of the scalar recurrence or the last value of the
3733   // vector recurrence we extracted in the middle block. Since the loop is in
3734   // LCSSA form, we just need to find all the phi nodes for the original scalar
3735   // recurrence in the exit block, and then add an edge for the middle block.
3736   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3737     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3738       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3739     }
3740   }
3741 }
3742 
3743 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3744   Constant *Zero = Builder.getInt32(0);
3745 
3746   // Get it's reduction variable descriptor.
3747   assert(Legal->isReductionVariable(Phi) &&
3748          "Unable to find the reduction variable");
3749   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3750 
3751   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3752   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3753   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3754   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3755     RdxDesc.getMinMaxRecurrenceKind();
3756   setDebugLocFromInst(Builder, ReductionStartValue);
3757 
3758   // We need to generate a reduction vector from the incoming scalar.
3759   // To do so, we need to generate the 'identity' vector and override
3760   // one of the elements with the incoming scalar reduction. We need
3761   // to do it in the vector-loop preheader.
3762   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3763 
3764   // This is the vector-clone of the value that leaves the loop.
3765   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3766 
3767   // Find the reduction identity variable. Zero for addition, or, xor,
3768   // one for multiplication, -1 for And.
3769   Value *Identity;
3770   Value *VectorStart;
3771   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3772       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3773     // MinMax reduction have the start value as their identify.
3774     if (VF == 1) {
3775       VectorStart = Identity = ReductionStartValue;
3776     } else {
3777       VectorStart = Identity =
3778         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3779     }
3780   } else {
3781     // Handle other reduction kinds:
3782     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3783         RK, VecTy->getScalarType());
3784     if (VF == 1) {
3785       Identity = Iden;
3786       // This vector is the Identity vector where the first element is the
3787       // incoming scalar reduction.
3788       VectorStart = ReductionStartValue;
3789     } else {
3790       Identity = ConstantVector::getSplat({VF, false}, Iden);
3791 
3792       // This vector is the Identity vector where the first element is the
3793       // incoming scalar reduction.
3794       VectorStart =
3795         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3796     }
3797   }
3798 
3799   // Wrap flags are in general invalid after vectorization, clear them.
3800   clearReductionWrapFlags(RdxDesc);
3801 
3802   // Fix the vector-loop phi.
3803 
3804   // Reductions do not have to start at zero. They can start with
3805   // any loop invariant values.
3806   BasicBlock *Latch = OrigLoop->getLoopLatch();
3807   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3808 
3809   for (unsigned Part = 0; Part < UF; ++Part) {
3810     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3811     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3812     // Make sure to add the reduction start value only to the
3813     // first unroll part.
3814     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3815     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3816     cast<PHINode>(VecRdxPhi)
3817       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3818   }
3819 
3820   // Before each round, move the insertion point right between
3821   // the PHIs and the values we are going to write.
3822   // This allows us to write both PHINodes and the extractelement
3823   // instructions.
3824   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3825 
3826   setDebugLocFromInst(Builder, LoopExitInst);
3827 
3828   // If tail is folded by masking, the vector value to leave the loop should be
3829   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3830   // instead of the former.
3831   if (Cost->foldTailByMasking()) {
3832     for (unsigned Part = 0; Part < UF; ++Part) {
3833       Value *VecLoopExitInst =
3834           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3835       Value *Sel = nullptr;
3836       for (User *U : VecLoopExitInst->users()) {
3837         if (isa<SelectInst>(U)) {
3838           assert(!Sel && "Reduction exit feeding two selects");
3839           Sel = U;
3840         } else
3841           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3842       }
3843       assert(Sel && "Reduction exit feeds no select");
3844       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3845     }
3846   }
3847 
3848   // If the vector reduction can be performed in a smaller type, we truncate
3849   // then extend the loop exit value to enable InstCombine to evaluate the
3850   // entire expression in the smaller type.
3851   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3852     Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF);
3853     Builder.SetInsertPoint(
3854         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3855     VectorParts RdxParts(UF);
3856     for (unsigned Part = 0; Part < UF; ++Part) {
3857       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3858       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3859       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3860                                         : Builder.CreateZExt(Trunc, VecTy);
3861       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3862            UI != RdxParts[Part]->user_end();)
3863         if (*UI != Trunc) {
3864           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3865           RdxParts[Part] = Extnd;
3866         } else {
3867           ++UI;
3868         }
3869     }
3870     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3871     for (unsigned Part = 0; Part < UF; ++Part) {
3872       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3873       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3874     }
3875   }
3876 
3877   // Reduce all of the unrolled parts into a single vector.
3878   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3879   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3880 
3881   // The middle block terminator has already been assigned a DebugLoc here (the
3882   // OrigLoop's single latch terminator). We want the whole middle block to
3883   // appear to execute on this line because: (a) it is all compiler generated,
3884   // (b) these instructions are always executed after evaluating the latch
3885   // conditional branch, and (c) other passes may add new predecessors which
3886   // terminate on this line. This is the easiest way to ensure we don't
3887   // accidentally cause an extra step back into the loop while debugging.
3888   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3889   for (unsigned Part = 1; Part < UF; ++Part) {
3890     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3891     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3892       // Floating point operations had to be 'fast' to enable the reduction.
3893       ReducedPartRdx = addFastMathFlag(
3894           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3895                               ReducedPartRdx, "bin.rdx"),
3896           RdxDesc.getFastMathFlags());
3897     else
3898       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3899                                       RdxPart);
3900   }
3901 
3902   if (VF > 1) {
3903     bool NoNaN = Legal->hasFunNoNaNAttr();
3904     ReducedPartRdx =
3905         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3906     // If the reduction can be performed in a smaller type, we need to extend
3907     // the reduction to the wider type before we branch to the original loop.
3908     if (Phi->getType() != RdxDesc.getRecurrenceType())
3909       ReducedPartRdx =
3910         RdxDesc.isSigned()
3911         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3912         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3913   }
3914 
3915   // Create a phi node that merges control-flow from the backedge-taken check
3916   // block and the middle block.
3917   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3918                                         LoopScalarPreHeader->getTerminator());
3919   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3920     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3921   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3922 
3923   // Now, we need to fix the users of the reduction variable
3924   // inside and outside of the scalar remainder loop.
3925   // We know that the loop is in LCSSA form. We need to update the
3926   // PHI nodes in the exit blocks.
3927   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3928     // All PHINodes need to have a single entry edge, or two if
3929     // we already fixed them.
3930     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3931 
3932     // We found a reduction value exit-PHI. Update it with the
3933     // incoming bypass edge.
3934     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3935       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3936   } // end of the LCSSA phi scan.
3937 
3938     // Fix the scalar loop reduction variable with the incoming reduction sum
3939     // from the vector body and from the backedge value.
3940   int IncomingEdgeBlockIdx =
3941     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3942   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3943   // Pick the other block.
3944   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3945   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3946   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3947 }
3948 
3949 void InnerLoopVectorizer::clearReductionWrapFlags(
3950     RecurrenceDescriptor &RdxDesc) {
3951   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3952   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3953       RK != RecurrenceDescriptor::RK_IntegerMult)
3954     return;
3955 
3956   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3957   assert(LoopExitInstr && "null loop exit instruction");
3958   SmallVector<Instruction *, 8> Worklist;
3959   SmallPtrSet<Instruction *, 8> Visited;
3960   Worklist.push_back(LoopExitInstr);
3961   Visited.insert(LoopExitInstr);
3962 
3963   while (!Worklist.empty()) {
3964     Instruction *Cur = Worklist.pop_back_val();
3965     if (isa<OverflowingBinaryOperator>(Cur))
3966       for (unsigned Part = 0; Part < UF; ++Part) {
3967         Value *V = getOrCreateVectorValue(Cur, Part);
3968         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3969       }
3970 
3971     for (User *U : Cur->users()) {
3972       Instruction *UI = cast<Instruction>(U);
3973       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3974           Visited.insert(UI).second)
3975         Worklist.push_back(UI);
3976     }
3977   }
3978 }
3979 
3980 void InnerLoopVectorizer::fixLCSSAPHIs() {
3981   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3982     if (LCSSAPhi.getNumIncomingValues() == 1) {
3983       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3984       // Non-instruction incoming values will have only one value.
3985       unsigned LastLane = 0;
3986       if (isa<Instruction>(IncomingValue))
3987           LastLane = Cost->isUniformAfterVectorization(
3988                          cast<Instruction>(IncomingValue), VF)
3989                          ? 0
3990                          : VF - 1;
3991       // Can be a loop invariant incoming value or the last scalar value to be
3992       // extracted from the vectorized loop.
3993       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3994       Value *lastIncomingValue =
3995           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3996       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3997     }
3998   }
3999 }
4000 
4001 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4002   // The basic block and loop containing the predicated instruction.
4003   auto *PredBB = PredInst->getParent();
4004   auto *VectorLoop = LI->getLoopFor(PredBB);
4005 
4006   // Initialize a worklist with the operands of the predicated instruction.
4007   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4008 
4009   // Holds instructions that we need to analyze again. An instruction may be
4010   // reanalyzed if we don't yet know if we can sink it or not.
4011   SmallVector<Instruction *, 8> InstsToReanalyze;
4012 
4013   // Returns true if a given use occurs in the predicated block. Phi nodes use
4014   // their operands in their corresponding predecessor blocks.
4015   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4016     auto *I = cast<Instruction>(U.getUser());
4017     BasicBlock *BB = I->getParent();
4018     if (auto *Phi = dyn_cast<PHINode>(I))
4019       BB = Phi->getIncomingBlock(
4020           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4021     return BB == PredBB;
4022   };
4023 
4024   // Iteratively sink the scalarized operands of the predicated instruction
4025   // into the block we created for it. When an instruction is sunk, it's
4026   // operands are then added to the worklist. The algorithm ends after one pass
4027   // through the worklist doesn't sink a single instruction.
4028   bool Changed;
4029   do {
4030     // Add the instructions that need to be reanalyzed to the worklist, and
4031     // reset the changed indicator.
4032     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4033     InstsToReanalyze.clear();
4034     Changed = false;
4035 
4036     while (!Worklist.empty()) {
4037       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4038 
4039       // We can't sink an instruction if it is a phi node, is already in the
4040       // predicated block, is not in the loop, or may have side effects.
4041       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4042           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4043         continue;
4044 
4045       // It's legal to sink the instruction if all its uses occur in the
4046       // predicated block. Otherwise, there's nothing to do yet, and we may
4047       // need to reanalyze the instruction.
4048       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4049         InstsToReanalyze.push_back(I);
4050         continue;
4051       }
4052 
4053       // Move the instruction to the beginning of the predicated block, and add
4054       // it's operands to the worklist.
4055       I->moveBefore(&*PredBB->getFirstInsertionPt());
4056       Worklist.insert(I->op_begin(), I->op_end());
4057 
4058       // The sinking may have enabled other instructions to be sunk, so we will
4059       // need to iterate.
4060       Changed = true;
4061     }
4062   } while (Changed);
4063 }
4064 
4065 void InnerLoopVectorizer::fixNonInductionPHIs() {
4066   for (PHINode *OrigPhi : OrigPHIsToFix) {
4067     PHINode *NewPhi =
4068         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4069     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4070 
4071     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4072         predecessors(OrigPhi->getParent()));
4073     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4074         predecessors(NewPhi->getParent()));
4075     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4076            "Scalar and Vector BB should have the same number of predecessors");
4077 
4078     // The insertion point in Builder may be invalidated by the time we get
4079     // here. Force the Builder insertion point to something valid so that we do
4080     // not run into issues during insertion point restore in
4081     // getOrCreateVectorValue calls below.
4082     Builder.SetInsertPoint(NewPhi);
4083 
4084     // The predecessor order is preserved and we can rely on mapping between
4085     // scalar and vector block predecessors.
4086     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4087       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4088 
4089       // When looking up the new scalar/vector values to fix up, use incoming
4090       // values from original phi.
4091       Value *ScIncV =
4092           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4093 
4094       // Scalar incoming value may need a broadcast
4095       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4096       NewPhi->addIncoming(NewIncV, NewPredBB);
4097     }
4098   }
4099 }
4100 
4101 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4102                                    unsigned UF, unsigned VF,
4103                                    bool IsPtrLoopInvariant,
4104                                    SmallBitVector &IsIndexLoopInvariant,
4105                                    VPTransformState &State) {
4106   // Construct a vector GEP by widening the operands of the scalar GEP as
4107   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4108   // results in a vector of pointers when at least one operand of the GEP
4109   // is vector-typed. Thus, to keep the representation compact, we only use
4110   // vector-typed operands for loop-varying values.
4111 
4112   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4113     // If we are vectorizing, but the GEP has only loop-invariant operands,
4114     // the GEP we build (by only using vector-typed operands for
4115     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4116     // produce a vector of pointers, we need to either arbitrarily pick an
4117     // operand to broadcast, or broadcast a clone of the original GEP.
4118     // Here, we broadcast a clone of the original.
4119     //
4120     // TODO: If at some point we decide to scalarize instructions having
4121     //       loop-invariant operands, this special case will no longer be
4122     //       required. We would add the scalarization decision to
4123     //       collectLoopScalars() and teach getVectorValue() to broadcast
4124     //       the lane-zero scalar value.
4125     auto *Clone = Builder.Insert(GEP->clone());
4126     for (unsigned Part = 0; Part < UF; ++Part) {
4127       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4128       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4129       addMetadata(EntryPart, GEP);
4130     }
4131   } else {
4132     // If the GEP has at least one loop-varying operand, we are sure to
4133     // produce a vector of pointers. But if we are only unrolling, we want
4134     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4135     // produce with the code below will be scalar (if VF == 1) or vector
4136     // (otherwise). Note that for the unroll-only case, we still maintain
4137     // values in the vector mapping with initVector, as we do for other
4138     // instructions.
4139     for (unsigned Part = 0; Part < UF; ++Part) {
4140       // The pointer operand of the new GEP. If it's loop-invariant, we
4141       // won't broadcast it.
4142       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4143                                      : State.get(Operands.getOperand(0), Part);
4144 
4145       // Collect all the indices for the new GEP. If any index is
4146       // loop-invariant, we won't broadcast it.
4147       SmallVector<Value *, 4> Indices;
4148       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4149         VPValue *Operand = Operands.getOperand(I);
4150         if (IsIndexLoopInvariant[I - 1])
4151           Indices.push_back(State.get(Operand, {0, 0}));
4152         else
4153           Indices.push_back(State.get(Operand, Part));
4154       }
4155 
4156       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4157       // but it should be a vector, otherwise.
4158       auto *NewGEP =
4159           GEP->isInBounds()
4160               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4161                                           Indices)
4162               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4163       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4164              "NewGEP is not a pointer vector");
4165       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4166       addMetadata(NewGEP, GEP);
4167     }
4168   }
4169 }
4170 
4171 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4172                                               unsigned VF) {
4173   PHINode *P = cast<PHINode>(PN);
4174   if (EnableVPlanNativePath) {
4175     // Currently we enter here in the VPlan-native path for non-induction
4176     // PHIs where all control flow is uniform. We simply widen these PHIs.
4177     // Create a vector phi with no operands - the vector phi operands will be
4178     // set at the end of vector code generation.
4179     Type *VecTy =
4180         (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
4181     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4182     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4183     OrigPHIsToFix.push_back(P);
4184 
4185     return;
4186   }
4187 
4188   assert(PN->getParent() == OrigLoop->getHeader() &&
4189          "Non-header phis should have been handled elsewhere");
4190 
4191   // In order to support recurrences we need to be able to vectorize Phi nodes.
4192   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4193   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4194   // this value when we vectorize all of the instructions that use the PHI.
4195   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4196     for (unsigned Part = 0; Part < UF; ++Part) {
4197       // This is phase one of vectorizing PHIs.
4198       Type *VecTy =
4199           (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
4200       Value *EntryPart = PHINode::Create(
4201           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4202       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4203     }
4204     return;
4205   }
4206 
4207   setDebugLocFromInst(Builder, P);
4208 
4209   // This PHINode must be an induction variable.
4210   // Make sure that we know about it.
4211   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4212 
4213   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4214   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4215 
4216   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4217   // which can be found from the original scalar operations.
4218   switch (II.getKind()) {
4219   case InductionDescriptor::IK_NoInduction:
4220     llvm_unreachable("Unknown induction");
4221   case InductionDescriptor::IK_IntInduction:
4222   case InductionDescriptor::IK_FpInduction:
4223     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4224   case InductionDescriptor::IK_PtrInduction: {
4225     // Handle the pointer induction variable case.
4226     assert(P->getType()->isPointerTy() && "Unexpected type.");
4227 
4228     if (Cost->isScalarAfterVectorization(P, VF)) {
4229       // This is the normalized GEP that starts counting at zero.
4230       Value *PtrInd =
4231           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4232       // Determine the number of scalars we need to generate for each unroll
4233       // iteration. If the instruction is uniform, we only need to generate the
4234       // first lane. Otherwise, we generate all VF values.
4235       unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4236       for (unsigned Part = 0; Part < UF; ++Part) {
4237         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4238           Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4239           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4240           Value *SclrGep =
4241               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4242           SclrGep->setName("next.gep");
4243           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4244         }
4245       }
4246       return;
4247     }
4248     assert(isa<SCEVConstant>(II.getStep()) &&
4249            "Induction step not a SCEV constant!");
4250     Type *PhiType = II.getStep()->getType();
4251 
4252     // Build a pointer phi
4253     Value *ScalarStartValue = II.getStartValue();
4254     Type *ScStValueType = ScalarStartValue->getType();
4255     PHINode *NewPointerPhi =
4256         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4257     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4258 
4259     // A pointer induction, performed by using a gep
4260     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4261     Instruction *InductionLoc = LoopLatch->getTerminator();
4262     const SCEV *ScalarStep = II.getStep();
4263     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4264     Value *ScalarStepValue =
4265         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4266     Value *InductionGEP = GetElementPtrInst::Create(
4267         ScStValueType->getPointerElementType(), NewPointerPhi,
4268         Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)),
4269         "ptr.ind", InductionLoc);
4270     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4271 
4272     // Create UF many actual address geps that use the pointer
4273     // phi as base and a vectorized version of the step value
4274     // (<step*0, ..., step*N>) as offset.
4275     for (unsigned Part = 0; Part < UF; ++Part) {
4276       SmallVector<Constant *, 8> Indices;
4277       // Create a vector of consecutive numbers from zero to VF.
4278       for (unsigned i = 0; i < VF; ++i)
4279         Indices.push_back(ConstantInt::get(PhiType, i + Part * VF));
4280       Constant *StartOffset = ConstantVector::get(Indices);
4281 
4282       Value *GEP = Builder.CreateGEP(
4283           ScStValueType->getPointerElementType(), NewPointerPhi,
4284           Builder.CreateMul(StartOffset,
4285                             Builder.CreateVectorSplat(VF, ScalarStepValue),
4286                             "vector.gep"));
4287       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4288     }
4289   }
4290   }
4291 }
4292 
4293 /// A helper function for checking whether an integer division-related
4294 /// instruction may divide by zero (in which case it must be predicated if
4295 /// executed conditionally in the scalar code).
4296 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4297 /// Non-zero divisors that are non compile-time constants will not be
4298 /// converted into multiplication, so we will still end up scalarizing
4299 /// the division, but can do so w/o predication.
4300 static bool mayDivideByZero(Instruction &I) {
4301   assert((I.getOpcode() == Instruction::UDiv ||
4302           I.getOpcode() == Instruction::SDiv ||
4303           I.getOpcode() == Instruction::URem ||
4304           I.getOpcode() == Instruction::SRem) &&
4305          "Unexpected instruction");
4306   Value *Divisor = I.getOperand(1);
4307   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4308   return !CInt || CInt->isZero();
4309 }
4310 
4311 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4312                                            VPTransformState &State) {
4313   switch (I.getOpcode()) {
4314   case Instruction::Call:
4315   case Instruction::Br:
4316   case Instruction::PHI:
4317   case Instruction::GetElementPtr:
4318   case Instruction::Select:
4319     llvm_unreachable("This instruction is handled by a different recipe.");
4320   case Instruction::UDiv:
4321   case Instruction::SDiv:
4322   case Instruction::SRem:
4323   case Instruction::URem:
4324   case Instruction::Add:
4325   case Instruction::FAdd:
4326   case Instruction::Sub:
4327   case Instruction::FSub:
4328   case Instruction::FNeg:
4329   case Instruction::Mul:
4330   case Instruction::FMul:
4331   case Instruction::FDiv:
4332   case Instruction::FRem:
4333   case Instruction::Shl:
4334   case Instruction::LShr:
4335   case Instruction::AShr:
4336   case Instruction::And:
4337   case Instruction::Or:
4338   case Instruction::Xor: {
4339     // Just widen unops and binops.
4340     setDebugLocFromInst(Builder, &I);
4341 
4342     for (unsigned Part = 0; Part < UF; ++Part) {
4343       SmallVector<Value *, 2> Ops;
4344       for (VPValue *VPOp : User.operands())
4345         Ops.push_back(State.get(VPOp, Part));
4346 
4347       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4348 
4349       if (auto *VecOp = dyn_cast<Instruction>(V))
4350         VecOp->copyIRFlags(&I);
4351 
4352       // Use this vector value for all users of the original instruction.
4353       VectorLoopValueMap.setVectorValue(&I, Part, V);
4354       addMetadata(V, &I);
4355     }
4356 
4357     break;
4358   }
4359   case Instruction::ICmp:
4360   case Instruction::FCmp: {
4361     // Widen compares. Generate vector compares.
4362     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4363     auto *Cmp = cast<CmpInst>(&I);
4364     setDebugLocFromInst(Builder, Cmp);
4365     for (unsigned Part = 0; Part < UF; ++Part) {
4366       Value *A = State.get(User.getOperand(0), Part);
4367       Value *B = State.get(User.getOperand(1), Part);
4368       Value *C = nullptr;
4369       if (FCmp) {
4370         // Propagate fast math flags.
4371         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4372         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4373         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4374       } else {
4375         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4376       }
4377       VectorLoopValueMap.setVectorValue(&I, Part, C);
4378       addMetadata(C, &I);
4379     }
4380 
4381     break;
4382   }
4383 
4384   case Instruction::ZExt:
4385   case Instruction::SExt:
4386   case Instruction::FPToUI:
4387   case Instruction::FPToSI:
4388   case Instruction::FPExt:
4389   case Instruction::PtrToInt:
4390   case Instruction::IntToPtr:
4391   case Instruction::SIToFP:
4392   case Instruction::UIToFP:
4393   case Instruction::Trunc:
4394   case Instruction::FPTrunc:
4395   case Instruction::BitCast: {
4396     auto *CI = cast<CastInst>(&I);
4397     setDebugLocFromInst(Builder, CI);
4398 
4399     /// Vectorize casts.
4400     Type *DestTy =
4401         (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF);
4402 
4403     for (unsigned Part = 0; Part < UF; ++Part) {
4404       Value *A = State.get(User.getOperand(0), Part);
4405       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4406       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4407       addMetadata(Cast, &I);
4408     }
4409     break;
4410   }
4411   default:
4412     // This instruction is not vectorized by simple widening.
4413     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4414     llvm_unreachable("Unhandled instruction!");
4415   } // end of switch.
4416 }
4417 
4418 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4419                                                VPTransformState &State) {
4420   assert(!isa<DbgInfoIntrinsic>(I) &&
4421          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4422   setDebugLocFromInst(Builder, &I);
4423 
4424   Module *M = I.getParent()->getParent()->getParent();
4425   auto *CI = cast<CallInst>(&I);
4426 
4427   SmallVector<Type *, 4> Tys;
4428   for (Value *ArgOperand : CI->arg_operands())
4429     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4430 
4431   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4432 
4433   // The flag shows whether we use Intrinsic or a usual Call for vectorized
4434   // version of the instruction.
4435   // Is it beneficial to perform intrinsic call compared to lib call?
4436   bool NeedToScalarize = false;
4437   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4438   bool UseVectorIntrinsic =
4439       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4440   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4441          "Instruction should be scalarized elsewhere.");
4442 
4443   for (unsigned Part = 0; Part < UF; ++Part) {
4444     SmallVector<Value *, 4> Args;
4445     for (auto &I : enumerate(ArgOperands.operands())) {
4446       // Some intrinsics have a scalar argument - don't replace it with a
4447       // vector.
4448       Value *Arg;
4449       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4450         Arg = State.get(I.value(), Part);
4451       else
4452         Arg = State.get(I.value(), {0, 0});
4453       Args.push_back(Arg);
4454     }
4455 
4456     Function *VectorF;
4457     if (UseVectorIntrinsic) {
4458       // Use vector version of the intrinsic.
4459       Type *TysForDecl[] = {CI->getType()};
4460       if (VF > 1)
4461         TysForDecl[0] =
4462             FixedVectorType::get(CI->getType()->getScalarType(), VF);
4463       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4464       assert(VectorF && "Can't retrieve vector intrinsic.");
4465     } else {
4466       // Use vector version of the function call.
4467       const VFShape Shape =
4468           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4469 #ifndef NDEBUG
4470       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4471              "Can't create vector function.");
4472 #endif
4473         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4474     }
4475       SmallVector<OperandBundleDef, 1> OpBundles;
4476       CI->getOperandBundlesAsDefs(OpBundles);
4477       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4478 
4479       if (isa<FPMathOperator>(V))
4480         V->copyFastMathFlags(CI);
4481 
4482       VectorLoopValueMap.setVectorValue(&I, Part, V);
4483       addMetadata(V, &I);
4484   }
4485 }
4486 
4487 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4488                                                  VPUser &Operands,
4489                                                  bool InvariantCond,
4490                                                  VPTransformState &State) {
4491   setDebugLocFromInst(Builder, &I);
4492 
4493   // The condition can be loop invariant  but still defined inside the
4494   // loop. This means that we can't just use the original 'cond' value.
4495   // We have to take the 'vectorized' value and pick the first lane.
4496   // Instcombine will make this a no-op.
4497   auto *InvarCond =
4498       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4499 
4500   for (unsigned Part = 0; Part < UF; ++Part) {
4501     Value *Cond =
4502         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4503     Value *Op0 = State.get(Operands.getOperand(1), Part);
4504     Value *Op1 = State.get(Operands.getOperand(2), Part);
4505     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4506     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4507     addMetadata(Sel, &I);
4508   }
4509 }
4510 
4511 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4512   // We should not collect Scalars more than once per VF. Right now, this
4513   // function is called from collectUniformsAndScalars(), which already does
4514   // this check. Collecting Scalars for VF=1 does not make any sense.
4515   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4516          "This function should not be visited twice for the same VF");
4517 
4518   SmallSetVector<Instruction *, 8> Worklist;
4519 
4520   // These sets are used to seed the analysis with pointers used by memory
4521   // accesses that will remain scalar.
4522   SmallSetVector<Instruction *, 8> ScalarPtrs;
4523   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4524   auto *Latch = TheLoop->getLoopLatch();
4525 
4526   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4527   // The pointer operands of loads and stores will be scalar as long as the
4528   // memory access is not a gather or scatter operation. The value operand of a
4529   // store will remain scalar if the store is scalarized.
4530   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4531     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4532     assert(WideningDecision != CM_Unknown &&
4533            "Widening decision should be ready at this moment");
4534     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4535       if (Ptr == Store->getValueOperand())
4536         return WideningDecision == CM_Scalarize;
4537     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4538            "Ptr is neither a value or pointer operand");
4539     return WideningDecision != CM_GatherScatter;
4540   };
4541 
4542   // A helper that returns true if the given value is a bitcast or
4543   // getelementptr instruction contained in the loop.
4544   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4545     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4546             isa<GetElementPtrInst>(V)) &&
4547            !TheLoop->isLoopInvariant(V);
4548   };
4549 
4550   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4551     if (!isa<PHINode>(Ptr) ||
4552         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4553       return false;
4554     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4555     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4556       return false;
4557     return isScalarUse(MemAccess, Ptr);
4558   };
4559 
4560   // A helper that evaluates a memory access's use of a pointer. If the
4561   // pointer is actually the pointer induction of a loop, it is being
4562   // inserted into Worklist. If the use will be a scalar use, and the
4563   // pointer is only used by memory accesses, we place the pointer in
4564   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4565   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4566     if (isScalarPtrInduction(MemAccess, Ptr)) {
4567       Worklist.insert(cast<Instruction>(Ptr));
4568       Instruction *Update = cast<Instruction>(
4569           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4570       Worklist.insert(Update);
4571       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4572                         << "\n");
4573       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4574                         << "\n");
4575       return;
4576     }
4577     // We only care about bitcast and getelementptr instructions contained in
4578     // the loop.
4579     if (!isLoopVaryingBitCastOrGEP(Ptr))
4580       return;
4581 
4582     // If the pointer has already been identified as scalar (e.g., if it was
4583     // also identified as uniform), there's nothing to do.
4584     auto *I = cast<Instruction>(Ptr);
4585     if (Worklist.count(I))
4586       return;
4587 
4588     // If the use of the pointer will be a scalar use, and all users of the
4589     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4590     // place the pointer in PossibleNonScalarPtrs.
4591     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4592           return isa<LoadInst>(U) || isa<StoreInst>(U);
4593         }))
4594       ScalarPtrs.insert(I);
4595     else
4596       PossibleNonScalarPtrs.insert(I);
4597   };
4598 
4599   // We seed the scalars analysis with three classes of instructions: (1)
4600   // instructions marked uniform-after-vectorization and (2) bitcast,
4601   // getelementptr and (pointer) phi instructions used by memory accesses
4602   // requiring a scalar use.
4603   //
4604   // (1) Add to the worklist all instructions that have been identified as
4605   // uniform-after-vectorization.
4606   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4607 
4608   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4609   // memory accesses requiring a scalar use. The pointer operands of loads and
4610   // stores will be scalar as long as the memory accesses is not a gather or
4611   // scatter operation. The value operand of a store will remain scalar if the
4612   // store is scalarized.
4613   for (auto *BB : TheLoop->blocks())
4614     for (auto &I : *BB) {
4615       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4616         evaluatePtrUse(Load, Load->getPointerOperand());
4617       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4618         evaluatePtrUse(Store, Store->getPointerOperand());
4619         evaluatePtrUse(Store, Store->getValueOperand());
4620       }
4621     }
4622   for (auto *I : ScalarPtrs)
4623     if (!PossibleNonScalarPtrs.count(I)) {
4624       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4625       Worklist.insert(I);
4626     }
4627 
4628   // Insert the forced scalars.
4629   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4630   // induction variable when the PHI user is scalarized.
4631   auto ForcedScalar = ForcedScalars.find(VF);
4632   if (ForcedScalar != ForcedScalars.end())
4633     for (auto *I : ForcedScalar->second)
4634       Worklist.insert(I);
4635 
4636   // Expand the worklist by looking through any bitcasts and getelementptr
4637   // instructions we've already identified as scalar. This is similar to the
4638   // expansion step in collectLoopUniforms(); however, here we're only
4639   // expanding to include additional bitcasts and getelementptr instructions.
4640   unsigned Idx = 0;
4641   while (Idx != Worklist.size()) {
4642     Instruction *Dst = Worklist[Idx++];
4643     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4644       continue;
4645     auto *Src = cast<Instruction>(Dst->getOperand(0));
4646     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4647           auto *J = cast<Instruction>(U);
4648           return !TheLoop->contains(J) || Worklist.count(J) ||
4649                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4650                   isScalarUse(J, Src));
4651         })) {
4652       Worklist.insert(Src);
4653       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4654     }
4655   }
4656 
4657   // An induction variable will remain scalar if all users of the induction
4658   // variable and induction variable update remain scalar.
4659   for (auto &Induction : Legal->getInductionVars()) {
4660     auto *Ind = Induction.first;
4661     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4662 
4663     // If tail-folding is applied, the primary induction variable will be used
4664     // to feed a vector compare.
4665     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4666       continue;
4667 
4668     // Determine if all users of the induction variable are scalar after
4669     // vectorization.
4670     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4671       auto *I = cast<Instruction>(U);
4672       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4673     });
4674     if (!ScalarInd)
4675       continue;
4676 
4677     // Determine if all users of the induction variable update instruction are
4678     // scalar after vectorization.
4679     auto ScalarIndUpdate =
4680         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4681           auto *I = cast<Instruction>(U);
4682           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4683         });
4684     if (!ScalarIndUpdate)
4685       continue;
4686 
4687     // The induction variable and its update instruction will remain scalar.
4688     Worklist.insert(Ind);
4689     Worklist.insert(IndUpdate);
4690     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4691     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4692                       << "\n");
4693   }
4694 
4695   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4696 }
4697 
4698 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4699   if (!blockNeedsPredication(I->getParent()))
4700     return false;
4701   switch(I->getOpcode()) {
4702   default:
4703     break;
4704   case Instruction::Load:
4705   case Instruction::Store: {
4706     if (!Legal->isMaskRequired(I))
4707       return false;
4708     auto *Ptr = getLoadStorePointerOperand(I);
4709     auto *Ty = getMemInstValueType(I);
4710     // We have already decided how to vectorize this instruction, get that
4711     // result.
4712     if (VF > 1) {
4713       InstWidening WideningDecision = getWideningDecision(I, VF);
4714       assert(WideningDecision != CM_Unknown &&
4715              "Widening decision should be ready at this moment");
4716       return WideningDecision == CM_Scalarize;
4717     }
4718     const Align Alignment = getLoadStoreAlignment(I);
4719     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4720                                 isLegalMaskedGather(Ty, Alignment))
4721                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4722                                 isLegalMaskedScatter(Ty, Alignment));
4723   }
4724   case Instruction::UDiv:
4725   case Instruction::SDiv:
4726   case Instruction::SRem:
4727   case Instruction::URem:
4728     return mayDivideByZero(*I);
4729   }
4730   return false;
4731 }
4732 
4733 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4734                                                                unsigned VF) {
4735   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4736   assert(getWideningDecision(I, VF) == CM_Unknown &&
4737          "Decision should not be set yet.");
4738   auto *Group = getInterleavedAccessGroup(I);
4739   assert(Group && "Must have a group.");
4740 
4741   // If the instruction's allocated size doesn't equal it's type size, it
4742   // requires padding and will be scalarized.
4743   auto &DL = I->getModule()->getDataLayout();
4744   auto *ScalarTy = getMemInstValueType(I);
4745   if (hasIrregularType(ScalarTy, DL, VF))
4746     return false;
4747 
4748   // Check if masking is required.
4749   // A Group may need masking for one of two reasons: it resides in a block that
4750   // needs predication, or it was decided to use masking to deal with gaps.
4751   bool PredicatedAccessRequiresMasking =
4752       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4753   bool AccessWithGapsRequiresMasking =
4754       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4755   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4756     return true;
4757 
4758   // If masked interleaving is required, we expect that the user/target had
4759   // enabled it, because otherwise it either wouldn't have been created or
4760   // it should have been invalidated by the CostModel.
4761   assert(useMaskedInterleavedAccesses(TTI) &&
4762          "Masked interleave-groups for predicated accesses are not enabled.");
4763 
4764   auto *Ty = getMemInstValueType(I);
4765   const Align Alignment = getLoadStoreAlignment(I);
4766   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4767                           : TTI.isLegalMaskedStore(Ty, Alignment);
4768 }
4769 
4770 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4771                                                                unsigned VF) {
4772   // Get and ensure we have a valid memory instruction.
4773   LoadInst *LI = dyn_cast<LoadInst>(I);
4774   StoreInst *SI = dyn_cast<StoreInst>(I);
4775   assert((LI || SI) && "Invalid memory instruction");
4776 
4777   auto *Ptr = getLoadStorePointerOperand(I);
4778 
4779   // In order to be widened, the pointer should be consecutive, first of all.
4780   if (!Legal->isConsecutivePtr(Ptr))
4781     return false;
4782 
4783   // If the instruction is a store located in a predicated block, it will be
4784   // scalarized.
4785   if (isScalarWithPredication(I))
4786     return false;
4787 
4788   // If the instruction's allocated size doesn't equal it's type size, it
4789   // requires padding and will be scalarized.
4790   auto &DL = I->getModule()->getDataLayout();
4791   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4792   if (hasIrregularType(ScalarTy, DL, VF))
4793     return false;
4794 
4795   return true;
4796 }
4797 
4798 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4799   // We should not collect Uniforms more than once per VF. Right now,
4800   // this function is called from collectUniformsAndScalars(), which
4801   // already does this check. Collecting Uniforms for VF=1 does not make any
4802   // sense.
4803 
4804   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4805          "This function should not be visited twice for the same VF");
4806 
4807   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4808   // not analyze again.  Uniforms.count(VF) will return 1.
4809   Uniforms[VF].clear();
4810 
4811   // We now know that the loop is vectorizable!
4812   // Collect instructions inside the loop that will remain uniform after
4813   // vectorization.
4814 
4815   // Global values, params and instructions outside of current loop are out of
4816   // scope.
4817   auto isOutOfScope = [&](Value *V) -> bool {
4818     Instruction *I = dyn_cast<Instruction>(V);
4819     return (!I || !TheLoop->contains(I));
4820   };
4821 
4822   SetVector<Instruction *> Worklist;
4823   BasicBlock *Latch = TheLoop->getLoopLatch();
4824 
4825   // Instructions that are scalar with predication must not be considered
4826   // uniform after vectorization, because that would create an erroneous
4827   // replicating region where only a single instance out of VF should be formed.
4828   // TODO: optimize such seldom cases if found important, see PR40816.
4829   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4830     if (isScalarWithPredication(I, VF)) {
4831       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4832                         << *I << "\n");
4833       return;
4834     }
4835     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4836     Worklist.insert(I);
4837   };
4838 
4839   // Start with the conditional branch. If the branch condition is an
4840   // instruction contained in the loop that is only used by the branch, it is
4841   // uniform.
4842   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4843   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4844     addToWorklistIfAllowed(Cmp);
4845 
4846   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4847   // are pointers that are treated like consecutive pointers during
4848   // vectorization. The pointer operands of interleaved accesses are an
4849   // example.
4850   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4851 
4852   // Holds pointer operands of instructions that are possibly non-uniform.
4853   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4854 
4855   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4856     InstWidening WideningDecision = getWideningDecision(I, VF);
4857     assert(WideningDecision != CM_Unknown &&
4858            "Widening decision should be ready at this moment");
4859 
4860     return (WideningDecision == CM_Widen ||
4861             WideningDecision == CM_Widen_Reverse ||
4862             WideningDecision == CM_Interleave);
4863   };
4864   // Iterate over the instructions in the loop, and collect all
4865   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4866   // that a consecutive-like pointer operand will be scalarized, we collect it
4867   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4868   // getelementptr instruction can be used by both vectorized and scalarized
4869   // memory instructions. For example, if a loop loads and stores from the same
4870   // location, but the store is conditional, the store will be scalarized, and
4871   // the getelementptr won't remain uniform.
4872   for (auto *BB : TheLoop->blocks())
4873     for (auto &I : *BB) {
4874       // If there's no pointer operand, there's nothing to do.
4875       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4876       if (!Ptr)
4877         continue;
4878 
4879       // True if all users of Ptr are memory accesses that have Ptr as their
4880       // pointer operand.
4881       auto UsersAreMemAccesses =
4882           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4883             return getLoadStorePointerOperand(U) == Ptr;
4884           });
4885 
4886       // Ensure the memory instruction will not be scalarized or used by
4887       // gather/scatter, making its pointer operand non-uniform. If the pointer
4888       // operand is used by any instruction other than a memory access, we
4889       // conservatively assume the pointer operand may be non-uniform.
4890       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4891         PossibleNonUniformPtrs.insert(Ptr);
4892 
4893       // If the memory instruction will be vectorized and its pointer operand
4894       // is consecutive-like, or interleaving - the pointer operand should
4895       // remain uniform.
4896       else
4897         ConsecutiveLikePtrs.insert(Ptr);
4898     }
4899 
4900   // Add to the Worklist all consecutive and consecutive-like pointers that
4901   // aren't also identified as possibly non-uniform.
4902   for (auto *V : ConsecutiveLikePtrs)
4903     if (!PossibleNonUniformPtrs.count(V))
4904       addToWorklistIfAllowed(V);
4905 
4906   // Expand Worklist in topological order: whenever a new instruction
4907   // is added , its users should be already inside Worklist.  It ensures
4908   // a uniform instruction will only be used by uniform instructions.
4909   unsigned idx = 0;
4910   while (idx != Worklist.size()) {
4911     Instruction *I = Worklist[idx++];
4912 
4913     for (auto OV : I->operand_values()) {
4914       // isOutOfScope operands cannot be uniform instructions.
4915       if (isOutOfScope(OV))
4916         continue;
4917       // First order recurrence Phi's should typically be considered
4918       // non-uniform.
4919       auto *OP = dyn_cast<PHINode>(OV);
4920       if (OP && Legal->isFirstOrderRecurrence(OP))
4921         continue;
4922       // If all the users of the operand are uniform, then add the
4923       // operand into the uniform worklist.
4924       auto *OI = cast<Instruction>(OV);
4925       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4926             auto *J = cast<Instruction>(U);
4927             return Worklist.count(J) ||
4928                    (OI == getLoadStorePointerOperand(J) &&
4929                     isUniformDecision(J, VF));
4930           }))
4931         addToWorklistIfAllowed(OI);
4932     }
4933   }
4934 
4935   // Returns true if Ptr is the pointer operand of a memory access instruction
4936   // I, and I is known to not require scalarization.
4937   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4938     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4939   };
4940 
4941   // For an instruction to be added into Worklist above, all its users inside
4942   // the loop should also be in Worklist. However, this condition cannot be
4943   // true for phi nodes that form a cyclic dependence. We must process phi
4944   // nodes separately. An induction variable will remain uniform if all users
4945   // of the induction variable and induction variable update remain uniform.
4946   // The code below handles both pointer and non-pointer induction variables.
4947   for (auto &Induction : Legal->getInductionVars()) {
4948     auto *Ind = Induction.first;
4949     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4950 
4951     // Determine if all users of the induction variable are uniform after
4952     // vectorization.
4953     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4954       auto *I = cast<Instruction>(U);
4955       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4956              isVectorizedMemAccessUse(I, Ind);
4957     });
4958     if (!UniformInd)
4959       continue;
4960 
4961     // Determine if all users of the induction variable update instruction are
4962     // uniform after vectorization.
4963     auto UniformIndUpdate =
4964         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4965           auto *I = cast<Instruction>(U);
4966           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4967                  isVectorizedMemAccessUse(I, IndUpdate);
4968         });
4969     if (!UniformIndUpdate)
4970       continue;
4971 
4972     // The induction variable and its update instruction will remain uniform.
4973     addToWorklistIfAllowed(Ind);
4974     addToWorklistIfAllowed(IndUpdate);
4975   }
4976 
4977   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4978 }
4979 
4980 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4981   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4982 
4983   if (Legal->getRuntimePointerChecking()->Need) {
4984     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4985         "runtime pointer checks needed. Enable vectorization of this "
4986         "loop with '#pragma clang loop vectorize(enable)' when "
4987         "compiling with -Os/-Oz",
4988         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4989     return true;
4990   }
4991 
4992   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4993     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4994         "runtime SCEV checks needed. Enable vectorization of this "
4995         "loop with '#pragma clang loop vectorize(enable)' when "
4996         "compiling with -Os/-Oz",
4997         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4998     return true;
4999   }
5000 
5001   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5002   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5003     reportVectorizationFailure("Runtime stride check for small trip count",
5004         "runtime stride == 1 checks needed. Enable vectorization of "
5005         "this loop without such check by compiling with -Os/-Oz",
5006         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5007     return true;
5008   }
5009 
5010   return false;
5011 }
5012 
5013 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
5014                                                             unsigned UserIC) {
5015   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5016     // TODO: It may by useful to do since it's still likely to be dynamically
5017     // uniform if the target can skip.
5018     reportVectorizationFailure(
5019         "Not inserting runtime ptr check for divergent target",
5020         "runtime pointer checks needed. Not enabled for divergent target",
5021         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5022     return None;
5023   }
5024 
5025   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5026   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5027   if (TC == 1) {
5028     reportVectorizationFailure("Single iteration (non) loop",
5029         "loop trip count is one, irrelevant for vectorization",
5030         "SingleIterationLoop", ORE, TheLoop);
5031     return None;
5032   }
5033 
5034   switch (ScalarEpilogueStatus) {
5035   case CM_ScalarEpilogueAllowed:
5036     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5037   case CM_ScalarEpilogueNotNeededUsePredicate:
5038     LLVM_DEBUG(
5039         dbgs() << "LV: vector predicate hint/switch found.\n"
5040                << "LV: Not allowing scalar epilogue, creating predicated "
5041                << "vector loop.\n");
5042     break;
5043   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5044     // fallthrough as a special case of OptForSize
5045   case CM_ScalarEpilogueNotAllowedOptSize:
5046     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5047       LLVM_DEBUG(
5048           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5049     else
5050       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5051                         << "count.\n");
5052 
5053     // Bail if runtime checks are required, which are not good when optimising
5054     // for size.
5055     if (runtimeChecksRequired())
5056       return None;
5057     break;
5058   }
5059 
5060   // Now try the tail folding
5061 
5062   // Invalidate interleave groups that require an epilogue if we can't mask
5063   // the interleave-group.
5064   if (!useMaskedInterleavedAccesses(TTI)) {
5065     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5066            "No decisions should have been taken at this point");
5067     // Note: There is no need to invalidate any cost modeling decisions here, as
5068     // non where taken so far.
5069     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5070   }
5071 
5072   unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5073   assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5074   unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
5075   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5076     // Accept MaxVF if we do not have a tail.
5077     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5078     return MaxVF;
5079   }
5080 
5081   // If we don't know the precise trip count, or if the trip count that we
5082   // found modulo the vectorization factor is not zero, try to fold the tail
5083   // by masking.
5084   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5085   if (Legal->prepareToFoldTailByMasking()) {
5086     FoldTailByMasking = true;
5087     return MaxVF;
5088   }
5089 
5090   if (TC == 0) {
5091     reportVectorizationFailure(
5092         "Unable to calculate the loop count due to complex control flow",
5093         "unable to calculate the loop count due to complex control flow",
5094         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5095     return None;
5096   }
5097 
5098   reportVectorizationFailure(
5099       "Cannot optimize for size and vectorize at the same time.",
5100       "cannot optimize for size and vectorize at the same time. "
5101       "Enable vectorization of this loop with '#pragma clang loop "
5102       "vectorize(enable)' when compiling with -Os/-Oz",
5103       "NoTailLoopWithOptForSize", ORE, TheLoop);
5104   return None;
5105 }
5106 
5107 unsigned
5108 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5109   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5110   unsigned SmallestType, WidestType;
5111   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5112   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5113 
5114   // Get the maximum safe dependence distance in bits computed by LAA.
5115   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5116   // the memory accesses that is most restrictive (involved in the smallest
5117   // dependence distance).
5118   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5119 
5120   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5121 
5122   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5123   // Note that both WidestRegister and WidestType may not be a powers of 2.
5124   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5125 
5126   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5127                     << " / " << WidestType << " bits.\n");
5128   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5129                     << WidestRegister << " bits.\n");
5130 
5131   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5132                                  " into one vector!");
5133   if (MaxVectorSize == 0) {
5134     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5135     MaxVectorSize = 1;
5136     return MaxVectorSize;
5137   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5138              isPowerOf2_32(ConstTripCount)) {
5139     // We need to clamp the VF to be the ConstTripCount. There is no point in
5140     // choosing a higher viable VF as done in the loop below.
5141     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5142                       << ConstTripCount << "\n");
5143     MaxVectorSize = ConstTripCount;
5144     return MaxVectorSize;
5145   }
5146 
5147   unsigned MaxVF = MaxVectorSize;
5148   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5149       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5150     // Collect all viable vectorization factors larger than the default MaxVF
5151     // (i.e. MaxVectorSize).
5152     SmallVector<unsigned, 8> VFs;
5153     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5154     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5155       VFs.push_back(VS);
5156 
5157     // For each VF calculate its register usage.
5158     auto RUs = calculateRegisterUsage(VFs);
5159 
5160     // Select the largest VF which doesn't require more registers than existing
5161     // ones.
5162     for (int i = RUs.size() - 1; i >= 0; --i) {
5163       bool Selected = true;
5164       for (auto& pair : RUs[i].MaxLocalUsers) {
5165         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5166         if (pair.second > TargetNumRegisters)
5167           Selected = false;
5168       }
5169       if (Selected) {
5170         MaxVF = VFs[i];
5171         break;
5172       }
5173     }
5174     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5175       if (MaxVF < MinVF) {
5176         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5177                           << ") with target's minimum: " << MinVF << '\n');
5178         MaxVF = MinVF;
5179       }
5180     }
5181   }
5182   return MaxVF;
5183 }
5184 
5185 VectorizationFactor
5186 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5187   float Cost = expectedCost(1).first;
5188   const float ScalarCost = Cost;
5189   unsigned Width = 1;
5190   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5191 
5192   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5193   if (ForceVectorization && MaxVF > 1) {
5194     // Ignore scalar width, because the user explicitly wants vectorization.
5195     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5196     // evaluation.
5197     Cost = std::numeric_limits<float>::max();
5198   }
5199 
5200   for (unsigned i = 2; i <= MaxVF; i *= 2) {
5201     // Notice that the vector loop needs to be executed less times, so
5202     // we need to divide the cost of the vector loops by the width of
5203     // the vector elements.
5204     VectorizationCostTy C = expectedCost(i);
5205     float VectorCost = C.first / (float)i;
5206     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5207                       << " costs: " << (int)VectorCost << ".\n");
5208     if (!C.second && !ForceVectorization) {
5209       LLVM_DEBUG(
5210           dbgs() << "LV: Not considering vector loop of width " << i
5211                  << " because it will not generate any vector instructions.\n");
5212       continue;
5213     }
5214     if (VectorCost < Cost) {
5215       Cost = VectorCost;
5216       Width = i;
5217     }
5218   }
5219 
5220   if (!EnableCondStoresVectorization && NumPredStores) {
5221     reportVectorizationFailure("There are conditional stores.",
5222         "store that is conditionally executed prevents vectorization",
5223         "ConditionalStore", ORE, TheLoop);
5224     Width = 1;
5225     Cost = ScalarCost;
5226   }
5227 
5228   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5229              << "LV: Vectorization seems to be not beneficial, "
5230              << "but was forced by a user.\n");
5231   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5232   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5233   return Factor;
5234 }
5235 
5236 std::pair<unsigned, unsigned>
5237 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5238   unsigned MinWidth = -1U;
5239   unsigned MaxWidth = 8;
5240   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5241 
5242   // For each block.
5243   for (BasicBlock *BB : TheLoop->blocks()) {
5244     // For each instruction in the loop.
5245     for (Instruction &I : BB->instructionsWithoutDebug()) {
5246       Type *T = I.getType();
5247 
5248       // Skip ignored values.
5249       if (ValuesToIgnore.count(&I))
5250         continue;
5251 
5252       // Only examine Loads, Stores and PHINodes.
5253       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5254         continue;
5255 
5256       // Examine PHI nodes that are reduction variables. Update the type to
5257       // account for the recurrence type.
5258       if (auto *PN = dyn_cast<PHINode>(&I)) {
5259         if (!Legal->isReductionVariable(PN))
5260           continue;
5261         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5262         T = RdxDesc.getRecurrenceType();
5263       }
5264 
5265       // Examine the stored values.
5266       if (auto *ST = dyn_cast<StoreInst>(&I))
5267         T = ST->getValueOperand()->getType();
5268 
5269       // Ignore loaded pointer types and stored pointer types that are not
5270       // vectorizable.
5271       //
5272       // FIXME: The check here attempts to predict whether a load or store will
5273       //        be vectorized. We only know this for certain after a VF has
5274       //        been selected. Here, we assume that if an access can be
5275       //        vectorized, it will be. We should also look at extending this
5276       //        optimization to non-pointer types.
5277       //
5278       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5279           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5280         continue;
5281 
5282       MinWidth = std::min(MinWidth,
5283                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5284       MaxWidth = std::max(MaxWidth,
5285                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5286     }
5287   }
5288 
5289   return {MinWidth, MaxWidth};
5290 }
5291 
5292 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5293                                                            unsigned LoopCost) {
5294   // -- The interleave heuristics --
5295   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5296   // There are many micro-architectural considerations that we can't predict
5297   // at this level. For example, frontend pressure (on decode or fetch) due to
5298   // code size, or the number and capabilities of the execution ports.
5299   //
5300   // We use the following heuristics to select the interleave count:
5301   // 1. If the code has reductions, then we interleave to break the cross
5302   // iteration dependency.
5303   // 2. If the loop is really small, then we interleave to reduce the loop
5304   // overhead.
5305   // 3. We don't interleave if we think that we will spill registers to memory
5306   // due to the increased register pressure.
5307 
5308   if (!isScalarEpilogueAllowed())
5309     return 1;
5310 
5311   // We used the distance for the interleave count.
5312   if (Legal->getMaxSafeDepDistBytes() != -1U)
5313     return 1;
5314 
5315   // Do not interleave loops with a relatively small known or estimated trip
5316   // count.
5317   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5318   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5319     return 1;
5320 
5321   RegisterUsage R = calculateRegisterUsage({VF})[0];
5322   // We divide by these constants so assume that we have at least one
5323   // instruction that uses at least one register.
5324   for (auto& pair : R.MaxLocalUsers) {
5325     pair.second = std::max(pair.second, 1U);
5326   }
5327 
5328   // We calculate the interleave count using the following formula.
5329   // Subtract the number of loop invariants from the number of available
5330   // registers. These registers are used by all of the interleaved instances.
5331   // Next, divide the remaining registers by the number of registers that is
5332   // required by the loop, in order to estimate how many parallel instances
5333   // fit without causing spills. All of this is rounded down if necessary to be
5334   // a power of two. We want power of two interleave count to simplify any
5335   // addressing operations or alignment considerations.
5336   // We also want power of two interleave counts to ensure that the induction
5337   // variable of the vector loop wraps to zero, when tail is folded by masking;
5338   // this currently happens when OptForSize, in which case IC is set to 1 above.
5339   unsigned IC = UINT_MAX;
5340 
5341   for (auto& pair : R.MaxLocalUsers) {
5342     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5343     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5344                       << " registers of "
5345                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5346     if (VF == 1) {
5347       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5348         TargetNumRegisters = ForceTargetNumScalarRegs;
5349     } else {
5350       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5351         TargetNumRegisters = ForceTargetNumVectorRegs;
5352     }
5353     unsigned MaxLocalUsers = pair.second;
5354     unsigned LoopInvariantRegs = 0;
5355     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5356       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5357 
5358     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5359     // Don't count the induction variable as interleaved.
5360     if (EnableIndVarRegisterHeur) {
5361       TmpIC =
5362           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5363                         std::max(1U, (MaxLocalUsers - 1)));
5364     }
5365 
5366     IC = std::min(IC, TmpIC);
5367   }
5368 
5369   // Clamp the interleave ranges to reasonable counts.
5370   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5371 
5372   // Check if the user has overridden the max.
5373   if (VF == 1) {
5374     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5375       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5376   } else {
5377     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5378       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5379   }
5380 
5381   // If trip count is known or estimated compile time constant, limit the
5382   // interleave count to be less than the trip count divided by VF.
5383   if (BestKnownTC) {
5384     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5385   }
5386 
5387   // If we did not calculate the cost for VF (because the user selected the VF)
5388   // then we calculate the cost of VF here.
5389   if (LoopCost == 0)
5390     LoopCost = expectedCost(VF).first;
5391 
5392   assert(LoopCost && "Non-zero loop cost expected");
5393 
5394   // Clamp the calculated IC to be between the 1 and the max interleave count
5395   // that the target and trip count allows.
5396   if (IC > MaxInterleaveCount)
5397     IC = MaxInterleaveCount;
5398   else if (IC < 1)
5399     IC = 1;
5400 
5401   // Interleave if we vectorized this loop and there is a reduction that could
5402   // benefit from interleaving.
5403   if (VF > 1 && !Legal->getReductionVars().empty()) {
5404     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5405     return IC;
5406   }
5407 
5408   // Note that if we've already vectorized the loop we will have done the
5409   // runtime check and so interleaving won't require further checks.
5410   bool InterleavingRequiresRuntimePointerCheck =
5411       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5412 
5413   // We want to interleave small loops in order to reduce the loop overhead and
5414   // potentially expose ILP opportunities.
5415   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5416   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5417     // We assume that the cost overhead is 1 and we use the cost model
5418     // to estimate the cost of the loop and interleave until the cost of the
5419     // loop overhead is about 5% of the cost of the loop.
5420     unsigned SmallIC =
5421         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5422 
5423     // Interleave until store/load ports (estimated by max interleave count) are
5424     // saturated.
5425     unsigned NumStores = Legal->getNumStores();
5426     unsigned NumLoads = Legal->getNumLoads();
5427     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5428     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5429 
5430     // If we have a scalar reduction (vector reductions are already dealt with
5431     // by this point), we can increase the critical path length if the loop
5432     // we're interleaving is inside another loop. Limit, by default to 2, so the
5433     // critical path only gets increased by one reduction operation.
5434     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5435       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5436       SmallIC = std::min(SmallIC, F);
5437       StoresIC = std::min(StoresIC, F);
5438       LoadsIC = std::min(LoadsIC, F);
5439     }
5440 
5441     if (EnableLoadStoreRuntimeInterleave &&
5442         std::max(StoresIC, LoadsIC) > SmallIC) {
5443       LLVM_DEBUG(
5444           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5445       return std::max(StoresIC, LoadsIC);
5446     }
5447 
5448     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5449     return SmallIC;
5450   }
5451 
5452   // Interleave if this is a large loop (small loops are already dealt with by
5453   // this point) that could benefit from interleaving.
5454   bool HasReductions = !Legal->getReductionVars().empty();
5455   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5456     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5457     return IC;
5458   }
5459 
5460   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5461   return 1;
5462 }
5463 
5464 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5465 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5466   // This function calculates the register usage by measuring the highest number
5467   // of values that are alive at a single location. Obviously, this is a very
5468   // rough estimation. We scan the loop in a topological order in order and
5469   // assign a number to each instruction. We use RPO to ensure that defs are
5470   // met before their users. We assume that each instruction that has in-loop
5471   // users starts an interval. We record every time that an in-loop value is
5472   // used, so we have a list of the first and last occurrences of each
5473   // instruction. Next, we transpose this data structure into a multi map that
5474   // holds the list of intervals that *end* at a specific location. This multi
5475   // map allows us to perform a linear search. We scan the instructions linearly
5476   // and record each time that a new interval starts, by placing it in a set.
5477   // If we find this value in the multi-map then we remove it from the set.
5478   // The max register usage is the maximum size of the set.
5479   // We also search for instructions that are defined outside the loop, but are
5480   // used inside the loop. We need this number separately from the max-interval
5481   // usage number because when we unroll, loop-invariant values do not take
5482   // more register.
5483   LoopBlocksDFS DFS(TheLoop);
5484   DFS.perform(LI);
5485 
5486   RegisterUsage RU;
5487 
5488   // Each 'key' in the map opens a new interval. The values
5489   // of the map are the index of the 'last seen' usage of the
5490   // instruction that is the key.
5491   using IntervalMap = DenseMap<Instruction *, unsigned>;
5492 
5493   // Maps instruction to its index.
5494   SmallVector<Instruction *, 64> IdxToInstr;
5495   // Marks the end of each interval.
5496   IntervalMap EndPoint;
5497   // Saves the list of instruction indices that are used in the loop.
5498   SmallPtrSet<Instruction *, 8> Ends;
5499   // Saves the list of values that are used in the loop but are
5500   // defined outside the loop, such as arguments and constants.
5501   SmallPtrSet<Value *, 8> LoopInvariants;
5502 
5503   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5504     for (Instruction &I : BB->instructionsWithoutDebug()) {
5505       IdxToInstr.push_back(&I);
5506 
5507       // Save the end location of each USE.
5508       for (Value *U : I.operands()) {
5509         auto *Instr = dyn_cast<Instruction>(U);
5510 
5511         // Ignore non-instruction values such as arguments, constants, etc.
5512         if (!Instr)
5513           continue;
5514 
5515         // If this instruction is outside the loop then record it and continue.
5516         if (!TheLoop->contains(Instr)) {
5517           LoopInvariants.insert(Instr);
5518           continue;
5519         }
5520 
5521         // Overwrite previous end points.
5522         EndPoint[Instr] = IdxToInstr.size();
5523         Ends.insert(Instr);
5524       }
5525     }
5526   }
5527 
5528   // Saves the list of intervals that end with the index in 'key'.
5529   using InstrList = SmallVector<Instruction *, 2>;
5530   DenseMap<unsigned, InstrList> TransposeEnds;
5531 
5532   // Transpose the EndPoints to a list of values that end at each index.
5533   for (auto &Interval : EndPoint)
5534     TransposeEnds[Interval.second].push_back(Interval.first);
5535 
5536   SmallPtrSet<Instruction *, 8> OpenIntervals;
5537 
5538   // Get the size of the widest register.
5539   unsigned MaxSafeDepDist = -1U;
5540   if (Legal->getMaxSafeDepDistBytes() != -1U)
5541     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5542   unsigned WidestRegister =
5543       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5544   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5545 
5546   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5547   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5548 
5549   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5550 
5551   // A lambda that gets the register usage for the given type and VF.
5552   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5553     if (Ty->isTokenTy())
5554       return 0U;
5555     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5556     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5557   };
5558 
5559   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5560     Instruction *I = IdxToInstr[i];
5561 
5562     // Remove all of the instructions that end at this location.
5563     InstrList &List = TransposeEnds[i];
5564     for (Instruction *ToRemove : List)
5565       OpenIntervals.erase(ToRemove);
5566 
5567     // Ignore instructions that are never used within the loop.
5568     if (!Ends.count(I))
5569       continue;
5570 
5571     // Skip ignored values.
5572     if (ValuesToIgnore.count(I))
5573       continue;
5574 
5575     // For each VF find the maximum usage of registers.
5576     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5577       // Count the number of live intervals.
5578       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5579 
5580       if (VFs[j] == 1) {
5581         for (auto Inst : OpenIntervals) {
5582           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5583           if (RegUsage.find(ClassID) == RegUsage.end())
5584             RegUsage[ClassID] = 1;
5585           else
5586             RegUsage[ClassID] += 1;
5587         }
5588       } else {
5589         collectUniformsAndScalars(VFs[j]);
5590         for (auto Inst : OpenIntervals) {
5591           // Skip ignored values for VF > 1.
5592           if (VecValuesToIgnore.count(Inst))
5593             continue;
5594           if (isScalarAfterVectorization(Inst, VFs[j])) {
5595             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5596             if (RegUsage.find(ClassID) == RegUsage.end())
5597               RegUsage[ClassID] = 1;
5598             else
5599               RegUsage[ClassID] += 1;
5600           } else {
5601             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5602             if (RegUsage.find(ClassID) == RegUsage.end())
5603               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5604             else
5605               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5606           }
5607         }
5608       }
5609 
5610       for (auto& pair : RegUsage) {
5611         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5612           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5613         else
5614           MaxUsages[j][pair.first] = pair.second;
5615       }
5616     }
5617 
5618     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5619                       << OpenIntervals.size() << '\n');
5620 
5621     // Add the current instruction to the list of open intervals.
5622     OpenIntervals.insert(I);
5623   }
5624 
5625   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5626     SmallMapVector<unsigned, unsigned, 4> Invariant;
5627 
5628     for (auto Inst : LoopInvariants) {
5629       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5630       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5631       if (Invariant.find(ClassID) == Invariant.end())
5632         Invariant[ClassID] = Usage;
5633       else
5634         Invariant[ClassID] += Usage;
5635     }
5636 
5637     LLVM_DEBUG({
5638       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5639       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5640              << " item\n";
5641       for (const auto &pair : MaxUsages[i]) {
5642         dbgs() << "LV(REG): RegisterClass: "
5643                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5644                << " registers\n";
5645       }
5646       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5647              << " item\n";
5648       for (const auto &pair : Invariant) {
5649         dbgs() << "LV(REG): RegisterClass: "
5650                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5651                << " registers\n";
5652       }
5653     });
5654 
5655     RU.LoopInvariantRegs = Invariant;
5656     RU.MaxLocalUsers = MaxUsages[i];
5657     RUs[i] = RU;
5658   }
5659 
5660   return RUs;
5661 }
5662 
5663 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5664   // TODO: Cost model for emulated masked load/store is completely
5665   // broken. This hack guides the cost model to use an artificially
5666   // high enough value to practically disable vectorization with such
5667   // operations, except where previously deployed legality hack allowed
5668   // using very low cost values. This is to avoid regressions coming simply
5669   // from moving "masked load/store" check from legality to cost model.
5670   // Masked Load/Gather emulation was previously never allowed.
5671   // Limited number of Masked Store/Scatter emulation was allowed.
5672   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5673   return isa<LoadInst>(I) ||
5674          (isa<StoreInst>(I) &&
5675           NumPredStores > NumberOfStoresToPredicate);
5676 }
5677 
5678 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5679   // If we aren't vectorizing the loop, or if we've already collected the
5680   // instructions to scalarize, there's nothing to do. Collection may already
5681   // have occurred if we have a user-selected VF and are now computing the
5682   // expected cost for interleaving.
5683   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5684     return;
5685 
5686   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5687   // not profitable to scalarize any instructions, the presence of VF in the
5688   // map will indicate that we've analyzed it already.
5689   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5690 
5691   // Find all the instructions that are scalar with predication in the loop and
5692   // determine if it would be better to not if-convert the blocks they are in.
5693   // If so, we also record the instructions to scalarize.
5694   for (BasicBlock *BB : TheLoop->blocks()) {
5695     if (!blockNeedsPredication(BB))
5696       continue;
5697     for (Instruction &I : *BB)
5698       if (isScalarWithPredication(&I)) {
5699         ScalarCostsTy ScalarCosts;
5700         // Do not apply discount logic if hacked cost is needed
5701         // for emulated masked memrefs.
5702         if (!useEmulatedMaskMemRefHack(&I) &&
5703             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5704           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5705         // Remember that BB will remain after vectorization.
5706         PredicatedBBsAfterVectorization.insert(BB);
5707       }
5708   }
5709 }
5710 
5711 int LoopVectorizationCostModel::computePredInstDiscount(
5712     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5713     unsigned VF) {
5714   assert(!isUniformAfterVectorization(PredInst, VF) &&
5715          "Instruction marked uniform-after-vectorization will be predicated");
5716 
5717   // Initialize the discount to zero, meaning that the scalar version and the
5718   // vector version cost the same.
5719   int Discount = 0;
5720 
5721   // Holds instructions to analyze. The instructions we visit are mapped in
5722   // ScalarCosts. Those instructions are the ones that would be scalarized if
5723   // we find that the scalar version costs less.
5724   SmallVector<Instruction *, 8> Worklist;
5725 
5726   // Returns true if the given instruction can be scalarized.
5727   auto canBeScalarized = [&](Instruction *I) -> bool {
5728     // We only attempt to scalarize instructions forming a single-use chain
5729     // from the original predicated block that would otherwise be vectorized.
5730     // Although not strictly necessary, we give up on instructions we know will
5731     // already be scalar to avoid traversing chains that are unlikely to be
5732     // beneficial.
5733     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5734         isScalarAfterVectorization(I, VF))
5735       return false;
5736 
5737     // If the instruction is scalar with predication, it will be analyzed
5738     // separately. We ignore it within the context of PredInst.
5739     if (isScalarWithPredication(I))
5740       return false;
5741 
5742     // If any of the instruction's operands are uniform after vectorization,
5743     // the instruction cannot be scalarized. This prevents, for example, a
5744     // masked load from being scalarized.
5745     //
5746     // We assume we will only emit a value for lane zero of an instruction
5747     // marked uniform after vectorization, rather than VF identical values.
5748     // Thus, if we scalarize an instruction that uses a uniform, we would
5749     // create uses of values corresponding to the lanes we aren't emitting code
5750     // for. This behavior can be changed by allowing getScalarValue to clone
5751     // the lane zero values for uniforms rather than asserting.
5752     for (Use &U : I->operands())
5753       if (auto *J = dyn_cast<Instruction>(U.get()))
5754         if (isUniformAfterVectorization(J, VF))
5755           return false;
5756 
5757     // Otherwise, we can scalarize the instruction.
5758     return true;
5759   };
5760 
5761   // Compute the expected cost discount from scalarizing the entire expression
5762   // feeding the predicated instruction. We currently only consider expressions
5763   // that are single-use instruction chains.
5764   Worklist.push_back(PredInst);
5765   while (!Worklist.empty()) {
5766     Instruction *I = Worklist.pop_back_val();
5767 
5768     // If we've already analyzed the instruction, there's nothing to do.
5769     if (ScalarCosts.find(I) != ScalarCosts.end())
5770       continue;
5771 
5772     // Compute the cost of the vector instruction. Note that this cost already
5773     // includes the scalarization overhead of the predicated instruction.
5774     unsigned VectorCost = getInstructionCost(I, VF).first;
5775 
5776     // Compute the cost of the scalarized instruction. This cost is the cost of
5777     // the instruction as if it wasn't if-converted and instead remained in the
5778     // predicated block. We will scale this cost by block probability after
5779     // computing the scalarization overhead.
5780     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5781 
5782     // Compute the scalarization overhead of needed insertelement instructions
5783     // and phi nodes.
5784     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5785       ScalarCost += TTI.getScalarizationOverhead(
5786           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5787           APInt::getAllOnesValue(VF), true, false);
5788       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI,
5789                                             TTI::TCK_RecipThroughput);
5790     }
5791 
5792     // Compute the scalarization overhead of needed extractelement
5793     // instructions. For each of the instruction's operands, if the operand can
5794     // be scalarized, add it to the worklist; otherwise, account for the
5795     // overhead.
5796     for (Use &U : I->operands())
5797       if (auto *J = dyn_cast<Instruction>(U.get())) {
5798         assert(VectorType::isValidElementType(J->getType()) &&
5799                "Instruction has non-scalar type");
5800         if (canBeScalarized(J))
5801           Worklist.push_back(J);
5802         else if (needsExtract(J, VF))
5803           ScalarCost += TTI.getScalarizationOverhead(
5804               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5805               APInt::getAllOnesValue(VF), false, true);
5806       }
5807 
5808     // Scale the total scalar cost by block probability.
5809     ScalarCost /= getReciprocalPredBlockProb();
5810 
5811     // Compute the discount. A non-negative discount means the vector version
5812     // of the instruction costs more, and scalarizing would be beneficial.
5813     Discount += VectorCost - ScalarCost;
5814     ScalarCosts[I] = ScalarCost;
5815   }
5816 
5817   return Discount;
5818 }
5819 
5820 LoopVectorizationCostModel::VectorizationCostTy
5821 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5822   VectorizationCostTy Cost;
5823 
5824   // For each block.
5825   for (BasicBlock *BB : TheLoop->blocks()) {
5826     VectorizationCostTy BlockCost;
5827 
5828     // For each instruction in the old loop.
5829     for (Instruction &I : BB->instructionsWithoutDebug()) {
5830       // Skip ignored values.
5831       if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I)))
5832         continue;
5833 
5834       VectorizationCostTy C = getInstructionCost(&I, VF);
5835 
5836       // Check if we should override the cost.
5837       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5838         C.first = ForceTargetInstructionCost;
5839 
5840       BlockCost.first += C.first;
5841       BlockCost.second |= C.second;
5842       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5843                         << " for VF " << VF << " For instruction: " << I
5844                         << '\n');
5845     }
5846 
5847     // If we are vectorizing a predicated block, it will have been
5848     // if-converted. This means that the block's instructions (aside from
5849     // stores and instructions that may divide by zero) will now be
5850     // unconditionally executed. For the scalar case, we may not always execute
5851     // the predicated block. Thus, scale the block's cost by the probability of
5852     // executing it.
5853     if (VF == 1 && blockNeedsPredication(BB))
5854       BlockCost.first /= getReciprocalPredBlockProb();
5855 
5856     Cost.first += BlockCost.first;
5857     Cost.second |= BlockCost.second;
5858   }
5859 
5860   return Cost;
5861 }
5862 
5863 /// Gets Address Access SCEV after verifying that the access pattern
5864 /// is loop invariant except the induction variable dependence.
5865 ///
5866 /// This SCEV can be sent to the Target in order to estimate the address
5867 /// calculation cost.
5868 static const SCEV *getAddressAccessSCEV(
5869               Value *Ptr,
5870               LoopVectorizationLegality *Legal,
5871               PredicatedScalarEvolution &PSE,
5872               const Loop *TheLoop) {
5873 
5874   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5875   if (!Gep)
5876     return nullptr;
5877 
5878   // We are looking for a gep with all loop invariant indices except for one
5879   // which should be an induction variable.
5880   auto SE = PSE.getSE();
5881   unsigned NumOperands = Gep->getNumOperands();
5882   for (unsigned i = 1; i < NumOperands; ++i) {
5883     Value *Opd = Gep->getOperand(i);
5884     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5885         !Legal->isInductionVariable(Opd))
5886       return nullptr;
5887   }
5888 
5889   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5890   return PSE.getSCEV(Ptr);
5891 }
5892 
5893 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5894   return Legal->hasStride(I->getOperand(0)) ||
5895          Legal->hasStride(I->getOperand(1));
5896 }
5897 
5898 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5899                                                                  unsigned VF) {
5900   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5901   Type *ValTy = getMemInstValueType(I);
5902   auto SE = PSE.getSE();
5903 
5904   unsigned AS = getLoadStoreAddressSpace(I);
5905   Value *Ptr = getLoadStorePointerOperand(I);
5906   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5907 
5908   // Figure out whether the access is strided and get the stride value
5909   // if it's known in compile time
5910   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5911 
5912   // Get the cost of the scalar memory instruction and address computation.
5913   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5914 
5915   // Don't pass *I here, since it is scalar but will actually be part of a
5916   // vectorized loop where the user of it is a vectorized instruction.
5917   const Align Alignment = getLoadStoreAlignment(I);
5918   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5919                                    Alignment, AS,
5920                                    TTI::TCK_RecipThroughput);
5921 
5922   // Get the overhead of the extractelement and insertelement instructions
5923   // we might create due to scalarization.
5924   Cost += getScalarizationOverhead(I, VF);
5925 
5926   // If we have a predicated store, it may not be executed for each vector
5927   // lane. Scale the cost by the probability of executing the predicated
5928   // block.
5929   if (isPredicatedInst(I)) {
5930     Cost /= getReciprocalPredBlockProb();
5931 
5932     if (useEmulatedMaskMemRefHack(I))
5933       // Artificially setting to a high enough value to practically disable
5934       // vectorization with such operations.
5935       Cost = 3000000;
5936   }
5937 
5938   return Cost;
5939 }
5940 
5941 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5942                                                              unsigned VF) {
5943   Type *ValTy = getMemInstValueType(I);
5944   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5945   Value *Ptr = getLoadStorePointerOperand(I);
5946   unsigned AS = getLoadStoreAddressSpace(I);
5947   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5948   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5949 
5950   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5951          "Stride should be 1 or -1 for consecutive memory access");
5952   const Align Alignment = getLoadStoreAlignment(I);
5953   unsigned Cost = 0;
5954   if (Legal->isMaskRequired(I))
5955     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5956                                       CostKind);
5957   else
5958     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5959                                 CostKind, I);
5960 
5961   bool Reverse = ConsecutiveStride < 0;
5962   if (Reverse)
5963     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5964   return Cost;
5965 }
5966 
5967 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5968                                                          unsigned VF) {
5969   Type *ValTy = getMemInstValueType(I);
5970   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5971   const Align Alignment = getLoadStoreAlignment(I);
5972   unsigned AS = getLoadStoreAddressSpace(I);
5973   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5974   if (isa<LoadInst>(I)) {
5975     return TTI.getAddressComputationCost(ValTy) +
5976            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5977                                CostKind) +
5978            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5979   }
5980   StoreInst *SI = cast<StoreInst>(I);
5981 
5982   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5983   return TTI.getAddressComputationCost(ValTy) +
5984          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5985                              CostKind) +
5986          (isLoopInvariantStoreValue
5987               ? 0
5988               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5989                                        VF - 1));
5990 }
5991 
5992 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5993                                                           unsigned VF) {
5994   Type *ValTy = getMemInstValueType(I);
5995   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5996   const Align Alignment = getLoadStoreAlignment(I);
5997   const Value *Ptr = getLoadStorePointerOperand(I);
5998 
5999   return TTI.getAddressComputationCost(VectorTy) +
6000          TTI.getGatherScatterOpCost(
6001              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6002              TargetTransformInfo::TCK_RecipThroughput, I);
6003 }
6004 
6005 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6006                                                             unsigned VF) {
6007   Type *ValTy = getMemInstValueType(I);
6008   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6009   unsigned AS = getLoadStoreAddressSpace(I);
6010 
6011   auto Group = getInterleavedAccessGroup(I);
6012   assert(Group && "Fail to get an interleaved access group.");
6013 
6014   unsigned InterleaveFactor = Group->getFactor();
6015   auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor);
6016 
6017   // Holds the indices of existing members in an interleaved load group.
6018   // An interleaved store group doesn't need this as it doesn't allow gaps.
6019   SmallVector<unsigned, 4> Indices;
6020   if (isa<LoadInst>(I)) {
6021     for (unsigned i = 0; i < InterleaveFactor; i++)
6022       if (Group->getMember(i))
6023         Indices.push_back(i);
6024   }
6025 
6026   // Calculate the cost of the whole interleaved group.
6027   bool UseMaskForGaps =
6028       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6029   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6030       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6031       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6032 
6033   if (Group->isReverse()) {
6034     // TODO: Add support for reversed masked interleaved access.
6035     assert(!Legal->isMaskRequired(I) &&
6036            "Reverse masked interleaved access not supported.");
6037     Cost += Group->getNumMembers() *
6038             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6039   }
6040   return Cost;
6041 }
6042 
6043 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6044                                                               unsigned VF) {
6045   // Calculate scalar cost only. Vectorization cost should be ready at this
6046   // moment.
6047   if (VF == 1) {
6048     Type *ValTy = getMemInstValueType(I);
6049     const Align Alignment = getLoadStoreAlignment(I);
6050     unsigned AS = getLoadStoreAddressSpace(I);
6051 
6052     return TTI.getAddressComputationCost(ValTy) +
6053            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6054                                TTI::TCK_RecipThroughput, I);
6055   }
6056   return getWideningCost(I, VF);
6057 }
6058 
6059 LoopVectorizationCostModel::VectorizationCostTy
6060 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
6061   // If we know that this instruction will remain uniform, check the cost of
6062   // the scalar version.
6063   if (isUniformAfterVectorization(I, VF))
6064     VF = 1;
6065 
6066   if (VF > 1 && isProfitableToScalarize(I, VF))
6067     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6068 
6069   // Forced scalars do not have any scalarization overhead.
6070   auto ForcedScalar = ForcedScalars.find(VF);
6071   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
6072     auto InstSet = ForcedScalar->second;
6073     if (InstSet.count(I))
6074       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
6075   }
6076 
6077   Type *VectorTy;
6078   unsigned C = getInstructionCost(I, VF, VectorTy);
6079 
6080   bool TypeNotScalarized =
6081       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
6082   return VectorizationCostTy(C, TypeNotScalarized);
6083 }
6084 
6085 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6086                                                               unsigned VF) {
6087 
6088   if (VF == 1)
6089     return 0;
6090 
6091   unsigned Cost = 0;
6092   Type *RetTy = ToVectorTy(I->getType(), VF);
6093   if (!RetTy->isVoidTy() &&
6094       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6095     Cost += TTI.getScalarizationOverhead(
6096         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false);
6097 
6098   // Some targets keep addresses scalar.
6099   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6100     return Cost;
6101 
6102   // Some targets support efficient element stores.
6103   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6104     return Cost;
6105 
6106   // Collect operands to consider.
6107   CallInst *CI = dyn_cast<CallInst>(I);
6108   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6109 
6110   // Skip operands that do not require extraction/scalarization and do not incur
6111   // any overhead.
6112   return Cost + TTI.getOperandsScalarizationOverhead(
6113                     filterExtractingOperands(Ops, VF), VF);
6114 }
6115 
6116 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6117   if (VF == 1)
6118     return;
6119   NumPredStores = 0;
6120   for (BasicBlock *BB : TheLoop->blocks()) {
6121     // For each instruction in the old loop.
6122     for (Instruction &I : *BB) {
6123       Value *Ptr =  getLoadStorePointerOperand(&I);
6124       if (!Ptr)
6125         continue;
6126 
6127       // TODO: We should generate better code and update the cost model for
6128       // predicated uniform stores. Today they are treated as any other
6129       // predicated store (see added test cases in
6130       // invariant-store-vectorization.ll).
6131       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6132         NumPredStores++;
6133 
6134       if (Legal->isUniform(Ptr) &&
6135           // Conditional loads and stores should be scalarized and predicated.
6136           // isScalarWithPredication cannot be used here since masked
6137           // gather/scatters are not considered scalar with predication.
6138           !Legal->blockNeedsPredication(I.getParent())) {
6139         // TODO: Avoid replicating loads and stores instead of
6140         // relying on instcombine to remove them.
6141         // Load: Scalar load + broadcast
6142         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6143         unsigned Cost = getUniformMemOpCost(&I, VF);
6144         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6145         continue;
6146       }
6147 
6148       // We assume that widening is the best solution when possible.
6149       if (memoryInstructionCanBeWidened(&I, VF)) {
6150         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6151         int ConsecutiveStride =
6152                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6153         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6154                "Expected consecutive stride.");
6155         InstWidening Decision =
6156             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6157         setWideningDecision(&I, VF, Decision, Cost);
6158         continue;
6159       }
6160 
6161       // Choose between Interleaving, Gather/Scatter or Scalarization.
6162       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6163       unsigned NumAccesses = 1;
6164       if (isAccessInterleaved(&I)) {
6165         auto Group = getInterleavedAccessGroup(&I);
6166         assert(Group && "Fail to get an interleaved access group.");
6167 
6168         // Make one decision for the whole group.
6169         if (getWideningDecision(&I, VF) != CM_Unknown)
6170           continue;
6171 
6172         NumAccesses = Group->getNumMembers();
6173         if (interleavedAccessCanBeWidened(&I, VF))
6174           InterleaveCost = getInterleaveGroupCost(&I, VF);
6175       }
6176 
6177       unsigned GatherScatterCost =
6178           isLegalGatherOrScatter(&I)
6179               ? getGatherScatterCost(&I, VF) * NumAccesses
6180               : std::numeric_limits<unsigned>::max();
6181 
6182       unsigned ScalarizationCost =
6183           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6184 
6185       // Choose better solution for the current VF,
6186       // write down this decision and use it during vectorization.
6187       unsigned Cost;
6188       InstWidening Decision;
6189       if (InterleaveCost <= GatherScatterCost &&
6190           InterleaveCost < ScalarizationCost) {
6191         Decision = CM_Interleave;
6192         Cost = InterleaveCost;
6193       } else if (GatherScatterCost < ScalarizationCost) {
6194         Decision = CM_GatherScatter;
6195         Cost = GatherScatterCost;
6196       } else {
6197         Decision = CM_Scalarize;
6198         Cost = ScalarizationCost;
6199       }
6200       // If the instructions belongs to an interleave group, the whole group
6201       // receives the same decision. The whole group receives the cost, but
6202       // the cost will actually be assigned to one instruction.
6203       if (auto Group = getInterleavedAccessGroup(&I))
6204         setWideningDecision(Group, VF, Decision, Cost);
6205       else
6206         setWideningDecision(&I, VF, Decision, Cost);
6207     }
6208   }
6209 
6210   // Make sure that any load of address and any other address computation
6211   // remains scalar unless there is gather/scatter support. This avoids
6212   // inevitable extracts into address registers, and also has the benefit of
6213   // activating LSR more, since that pass can't optimize vectorized
6214   // addresses.
6215   if (TTI.prefersVectorizedAddressing())
6216     return;
6217 
6218   // Start with all scalar pointer uses.
6219   SmallPtrSet<Instruction *, 8> AddrDefs;
6220   for (BasicBlock *BB : TheLoop->blocks())
6221     for (Instruction &I : *BB) {
6222       Instruction *PtrDef =
6223         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6224       if (PtrDef && TheLoop->contains(PtrDef) &&
6225           getWideningDecision(&I, VF) != CM_GatherScatter)
6226         AddrDefs.insert(PtrDef);
6227     }
6228 
6229   // Add all instructions used to generate the addresses.
6230   SmallVector<Instruction *, 4> Worklist;
6231   for (auto *I : AddrDefs)
6232     Worklist.push_back(I);
6233   while (!Worklist.empty()) {
6234     Instruction *I = Worklist.pop_back_val();
6235     for (auto &Op : I->operands())
6236       if (auto *InstOp = dyn_cast<Instruction>(Op))
6237         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6238             AddrDefs.insert(InstOp).second)
6239           Worklist.push_back(InstOp);
6240   }
6241 
6242   for (auto *I : AddrDefs) {
6243     if (isa<LoadInst>(I)) {
6244       // Setting the desired widening decision should ideally be handled in
6245       // by cost functions, but since this involves the task of finding out
6246       // if the loaded register is involved in an address computation, it is
6247       // instead changed here when we know this is the case.
6248       InstWidening Decision = getWideningDecision(I, VF);
6249       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6250         // Scalarize a widened load of address.
6251         setWideningDecision(I, VF, CM_Scalarize,
6252                             (VF * getMemoryInstructionCost(I, 1)));
6253       else if (auto Group = getInterleavedAccessGroup(I)) {
6254         // Scalarize an interleave group of address loads.
6255         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6256           if (Instruction *Member = Group->getMember(I))
6257             setWideningDecision(Member, VF, CM_Scalarize,
6258                                 (VF * getMemoryInstructionCost(Member, 1)));
6259         }
6260       }
6261     } else
6262       // Make sure I gets scalarized and a cost estimate without
6263       // scalarization overhead.
6264       ForcedScalars[VF].insert(I);
6265   }
6266 }
6267 
6268 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6269                                                         unsigned VF,
6270                                                         Type *&VectorTy) {
6271   Type *RetTy = I->getType();
6272   if (canTruncateToMinimalBitwidth(I, VF))
6273     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6274   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6275   auto SE = PSE.getSE();
6276   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6277 
6278   // TODO: We need to estimate the cost of intrinsic calls.
6279   switch (I->getOpcode()) {
6280   case Instruction::GetElementPtr:
6281     // We mark this instruction as zero-cost because the cost of GEPs in
6282     // vectorized code depends on whether the corresponding memory instruction
6283     // is scalarized or not. Therefore, we handle GEPs with the memory
6284     // instruction cost.
6285     return 0;
6286   case Instruction::Br: {
6287     // In cases of scalarized and predicated instructions, there will be VF
6288     // predicated blocks in the vectorized loop. Each branch around these
6289     // blocks requires also an extract of its vector compare i1 element.
6290     bool ScalarPredicatedBB = false;
6291     BranchInst *BI = cast<BranchInst>(I);
6292     if (VF > 1 && BI->isConditional() &&
6293         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6294          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6295       ScalarPredicatedBB = true;
6296 
6297     if (ScalarPredicatedBB) {
6298       // Return cost for branches around scalarized and predicated blocks.
6299       auto *Vec_i1Ty =
6300           FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6301       return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
6302                                            false, true) +
6303               (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF));
6304     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6305       // The back-edge branch will remain, as will all scalar branches.
6306       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6307     else
6308       // This branch will be eliminated by if-conversion.
6309       return 0;
6310     // Note: We currently assume zero cost for an unconditional branch inside
6311     // a predicated block since it will become a fall-through, although we
6312     // may decide in the future to call TTI for all branches.
6313   }
6314   case Instruction::PHI: {
6315     auto *Phi = cast<PHINode>(I);
6316 
6317     // First-order recurrences are replaced by vector shuffles inside the loop.
6318     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6319     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6320       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6321                                 cast<VectorType>(VectorTy), VF - 1,
6322                                 FixedVectorType::get(RetTy, 1));
6323 
6324     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6325     // converted into select instructions. We require N - 1 selects per phi
6326     // node, where N is the number of incoming values.
6327     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6328       return (Phi->getNumIncomingValues() - 1) *
6329              TTI.getCmpSelInstrCost(
6330                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6331                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6332                  CostKind);
6333 
6334     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6335   }
6336   case Instruction::UDiv:
6337   case Instruction::SDiv:
6338   case Instruction::URem:
6339   case Instruction::SRem:
6340     // If we have a predicated instruction, it may not be executed for each
6341     // vector lane. Get the scalarization cost and scale this amount by the
6342     // probability of executing the predicated block. If the instruction is not
6343     // predicated, we fall through to the next case.
6344     if (VF > 1 && isScalarWithPredication(I)) {
6345       unsigned Cost = 0;
6346 
6347       // These instructions have a non-void type, so account for the phi nodes
6348       // that we will create. This cost is likely to be zero. The phi node
6349       // cost, if any, should be scaled by the block probability because it
6350       // models a copy at the end of each predicated block.
6351       Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6352 
6353       // The cost of the non-predicated instruction.
6354       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6355 
6356       // The cost of insertelement and extractelement instructions needed for
6357       // scalarization.
6358       Cost += getScalarizationOverhead(I, VF);
6359 
6360       // Scale the cost by the probability of executing the predicated blocks.
6361       // This assumes the predicated block for each vector lane is equally
6362       // likely.
6363       return Cost / getReciprocalPredBlockProb();
6364     }
6365     LLVM_FALLTHROUGH;
6366   case Instruction::Add:
6367   case Instruction::FAdd:
6368   case Instruction::Sub:
6369   case Instruction::FSub:
6370   case Instruction::Mul:
6371   case Instruction::FMul:
6372   case Instruction::FDiv:
6373   case Instruction::FRem:
6374   case Instruction::Shl:
6375   case Instruction::LShr:
6376   case Instruction::AShr:
6377   case Instruction::And:
6378   case Instruction::Or:
6379   case Instruction::Xor: {
6380     // Since we will replace the stride by 1 the multiplication should go away.
6381     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6382       return 0;
6383     // Certain instructions can be cheaper to vectorize if they have a constant
6384     // second vector operand. One example of this are shifts on x86.
6385     Value *Op2 = I->getOperand(1);
6386     TargetTransformInfo::OperandValueProperties Op2VP;
6387     TargetTransformInfo::OperandValueKind Op2VK =
6388         TTI.getOperandInfo(Op2, Op2VP);
6389     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6390       Op2VK = TargetTransformInfo::OK_UniformValue;
6391 
6392     SmallVector<const Value *, 4> Operands(I->operand_values());
6393     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6394     return N * TTI.getArithmeticInstrCost(
6395                    I->getOpcode(), VectorTy, CostKind,
6396                    TargetTransformInfo::OK_AnyValue,
6397                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6398   }
6399   case Instruction::FNeg: {
6400     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6401     return N * TTI.getArithmeticInstrCost(
6402                    I->getOpcode(), VectorTy, CostKind,
6403                    TargetTransformInfo::OK_AnyValue,
6404                    TargetTransformInfo::OK_AnyValue,
6405                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6406                    I->getOperand(0), I);
6407   }
6408   case Instruction::Select: {
6409     SelectInst *SI = cast<SelectInst>(I);
6410     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6411     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6412     Type *CondTy = SI->getCondition()->getType();
6413     if (!ScalarCond)
6414       CondTy = FixedVectorType::get(CondTy, VF);
6415 
6416     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6417                                   CostKind, I);
6418   }
6419   case Instruction::ICmp:
6420   case Instruction::FCmp: {
6421     Type *ValTy = I->getOperand(0)->getType();
6422     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6423     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6424       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6425     VectorTy = ToVectorTy(ValTy, VF);
6426     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6427                                   I);
6428   }
6429   case Instruction::Store:
6430   case Instruction::Load: {
6431     unsigned Width = VF;
6432     if (Width > 1) {
6433       InstWidening Decision = getWideningDecision(I, Width);
6434       assert(Decision != CM_Unknown &&
6435              "CM decision should be taken at this point");
6436       if (Decision == CM_Scalarize)
6437         Width = 1;
6438     }
6439     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6440     return getMemoryInstructionCost(I, VF);
6441   }
6442   case Instruction::ZExt:
6443   case Instruction::SExt:
6444   case Instruction::FPToUI:
6445   case Instruction::FPToSI:
6446   case Instruction::FPExt:
6447   case Instruction::PtrToInt:
6448   case Instruction::IntToPtr:
6449   case Instruction::SIToFP:
6450   case Instruction::UIToFP:
6451   case Instruction::Trunc:
6452   case Instruction::FPTrunc:
6453   case Instruction::BitCast: {
6454     // We optimize the truncation of induction variables having constant
6455     // integer steps. The cost of these truncations is the same as the scalar
6456     // operation.
6457     if (isOptimizableIVTruncate(I, VF)) {
6458       auto *Trunc = cast<TruncInst>(I);
6459       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6460                                   Trunc->getSrcTy(), CostKind, Trunc);
6461     }
6462 
6463     Type *SrcScalarTy = I->getOperand(0)->getType();
6464     Type *SrcVecTy =
6465         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6466     if (canTruncateToMinimalBitwidth(I, VF)) {
6467       // This cast is going to be shrunk. This may remove the cast or it might
6468       // turn it into slightly different cast. For example, if MinBW == 16,
6469       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6470       //
6471       // Calculate the modified src and dest types.
6472       Type *MinVecTy = VectorTy;
6473       if (I->getOpcode() == Instruction::Trunc) {
6474         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6475         VectorTy =
6476             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6477       } else if (I->getOpcode() == Instruction::ZExt ||
6478                  I->getOpcode() == Instruction::SExt) {
6479         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6480         VectorTy =
6481             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6482       }
6483     }
6484 
6485     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6486     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
6487                                     CostKind, I);
6488   }
6489   case Instruction::Call: {
6490     bool NeedToScalarize;
6491     CallInst *CI = cast<CallInst>(I);
6492     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6493     if (getVectorIntrinsicIDForCall(CI, TLI))
6494       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6495     return CallCost;
6496   }
6497   default:
6498     // The cost of executing VF copies of the scalar instruction. This opcode
6499     // is unknown. Assume that it is the same as 'mul'.
6500     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
6501                                            CostKind) +
6502            getScalarizationOverhead(I, VF);
6503   } // end of switch.
6504 }
6505 
6506 char LoopVectorize::ID = 0;
6507 
6508 static const char lv_name[] = "Loop Vectorization";
6509 
6510 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6511 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6512 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6513 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6514 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6515 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6516 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6517 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6518 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6519 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6520 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6521 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6522 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6523 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6524 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6525 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6526 
6527 namespace llvm {
6528 
6529 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6530 
6531 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6532                               bool VectorizeOnlyWhenForced) {
6533   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6534 }
6535 
6536 } // end namespace llvm
6537 
6538 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6539   // Check if the pointer operand of a load or store instruction is
6540   // consecutive.
6541   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6542     return Legal->isConsecutivePtr(Ptr);
6543   return false;
6544 }
6545 
6546 void LoopVectorizationCostModel::collectValuesToIgnore() {
6547   // Ignore ephemeral values.
6548   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6549 
6550   // Ignore type-promoting instructions we identified during reduction
6551   // detection.
6552   for (auto &Reduction : Legal->getReductionVars()) {
6553     RecurrenceDescriptor &RedDes = Reduction.second;
6554     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6555     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6556   }
6557   // Ignore type-casting instructions we identified during induction
6558   // detection.
6559   for (auto &Induction : Legal->getInductionVars()) {
6560     InductionDescriptor &IndDes = Induction.second;
6561     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6562     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6563   }
6564 }
6565 
6566 // TODO: we could return a pair of values that specify the max VF and
6567 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6568 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6569 // doesn't have a cost model that can choose which plan to execute if
6570 // more than one is generated.
6571 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6572                                  LoopVectorizationCostModel &CM) {
6573   unsigned WidestType;
6574   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6575   return WidestVectorRegBits / WidestType;
6576 }
6577 
6578 VectorizationFactor
6579 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6580   unsigned VF = UserVF;
6581   // Outer loop handling: They may require CFG and instruction level
6582   // transformations before even evaluating whether vectorization is profitable.
6583   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6584   // the vectorization pipeline.
6585   if (!OrigLoop->empty()) {
6586     // If the user doesn't provide a vectorization factor, determine a
6587     // reasonable one.
6588     if (!UserVF) {
6589       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6590       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6591 
6592       // Make sure we have a VF > 1 for stress testing.
6593       if (VPlanBuildStressTest && VF < 2) {
6594         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6595                           << "overriding computed VF.\n");
6596         VF = 4;
6597       }
6598     }
6599     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6600     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6601     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6602                       << " to build VPlans.\n");
6603     buildVPlans(VF, VF);
6604 
6605     // For VPlan build stress testing, we bail out after VPlan construction.
6606     if (VPlanBuildStressTest)
6607       return VectorizationFactor::Disabled();
6608 
6609     return {VF, 0};
6610   }
6611 
6612   LLVM_DEBUG(
6613       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6614                 "VPlan-native path.\n");
6615   return VectorizationFactor::Disabled();
6616 }
6617 
6618 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
6619                                                              unsigned UserIC) {
6620   assert(OrigLoop->empty() && "Inner loop expected.");
6621   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
6622   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
6623     return None;
6624 
6625   // Invalidate interleave groups if all blocks of loop will be predicated.
6626   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6627       !useMaskedInterleavedAccesses(*TTI)) {
6628     LLVM_DEBUG(
6629         dbgs()
6630         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6631            "which requires masked-interleaved support.\n");
6632     if (CM.InterleaveInfo.invalidateGroups())
6633       // Invalidating interleave groups also requires invalidating all decisions
6634       // based on them, which includes widening decisions and uniform and scalar
6635       // values.
6636       CM.invalidateCostModelingDecisions();
6637   }
6638 
6639   if (UserVF) {
6640     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6641     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6642     // Collect the instructions (and their associated costs) that will be more
6643     // profitable to scalarize.
6644     CM.selectUserVectorizationFactor(UserVF);
6645     buildVPlansWithVPRecipes(UserVF, UserVF);
6646     LLVM_DEBUG(printPlans(dbgs()));
6647     return {{UserVF, 0}};
6648   }
6649 
6650   unsigned MaxVF = MaybeMaxVF.getValue();
6651   assert(MaxVF != 0 && "MaxVF is zero.");
6652 
6653   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6654     // Collect Uniform and Scalar instructions after vectorization with VF.
6655     CM.collectUniformsAndScalars(VF);
6656 
6657     // Collect the instructions (and their associated costs) that will be more
6658     // profitable to scalarize.
6659     if (VF > 1)
6660       CM.collectInstsToScalarize(VF);
6661   }
6662 
6663   buildVPlansWithVPRecipes(1, MaxVF);
6664   LLVM_DEBUG(printPlans(dbgs()));
6665   if (MaxVF == 1)
6666     return VectorizationFactor::Disabled();
6667 
6668   // Select the optimal vectorization factor.
6669   return CM.selectVectorizationFactor(MaxVF);
6670 }
6671 
6672 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6673   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6674                     << '\n');
6675   BestVF = VF;
6676   BestUF = UF;
6677 
6678   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6679     return !Plan->hasVF(VF);
6680   });
6681   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6682 }
6683 
6684 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6685                                            DominatorTree *DT) {
6686   // Perform the actual loop transformation.
6687 
6688   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6689   VPCallbackILV CallbackILV(ILV);
6690 
6691   VPTransformState State{BestVF, BestUF,      LI,
6692                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6693                          &ILV,   CallbackILV};
6694   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6695   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6696   State.CanonicalIV = ILV.Induction;
6697 
6698   //===------------------------------------------------===//
6699   //
6700   // Notice: any optimization or new instruction that go
6701   // into the code below should also be implemented in
6702   // the cost-model.
6703   //
6704   //===------------------------------------------------===//
6705 
6706   // 2. Copy and widen instructions from the old loop into the new loop.
6707   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6708   VPlans.front()->execute(&State);
6709 
6710   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6711   //    predication, updating analyses.
6712   ILV.fixVectorizedLoop();
6713 }
6714 
6715 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6716     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6717   BasicBlock *Latch = OrigLoop->getLoopLatch();
6718 
6719   // We create new control-flow for the vectorized loop, so the original
6720   // condition will be dead after vectorization if it's only used by the
6721   // branch.
6722   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6723   if (Cmp && Cmp->hasOneUse())
6724     DeadInstructions.insert(Cmp);
6725 
6726   // We create new "steps" for induction variable updates to which the original
6727   // induction variables map. An original update instruction will be dead if
6728   // all its users except the induction variable are dead.
6729   for (auto &Induction : Legal->getInductionVars()) {
6730     PHINode *Ind = Induction.first;
6731     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6732     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6733           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
6734         }))
6735       DeadInstructions.insert(IndUpdate);
6736 
6737     // We record as "Dead" also the type-casting instructions we had identified
6738     // during induction analysis. We don't need any handling for them in the
6739     // vectorized loop because we have proven that, under a proper runtime
6740     // test guarding the vectorized loop, the value of the phi, and the casted
6741     // value of the phi, are the same. The last instruction in this casting chain
6742     // will get its scalar/vector/widened def from the scalar/vector/widened def
6743     // of the respective phi node. Any other casts in the induction def-use chain
6744     // have no other uses outside the phi update chain, and will be ignored.
6745     InductionDescriptor &IndDes = Induction.second;
6746     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6747     DeadInstructions.insert(Casts.begin(), Casts.end());
6748   }
6749 }
6750 
6751 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6752 
6753 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6754 
6755 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6756                                         Instruction::BinaryOps BinOp) {
6757   // When unrolling and the VF is 1, we only need to add a simple scalar.
6758   Type *Ty = Val->getType();
6759   assert(!Ty->isVectorTy() && "Val must be a scalar");
6760 
6761   if (Ty->isFloatingPointTy()) {
6762     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6763 
6764     // Floating point operations had to be 'fast' to enable the unrolling.
6765     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6766     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6767   }
6768   Constant *C = ConstantInt::get(Ty, StartIdx);
6769   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6770 }
6771 
6772 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6773   SmallVector<Metadata *, 4> MDs;
6774   // Reserve first location for self reference to the LoopID metadata node.
6775   MDs.push_back(nullptr);
6776   bool IsUnrollMetadata = false;
6777   MDNode *LoopID = L->getLoopID();
6778   if (LoopID) {
6779     // First find existing loop unrolling disable metadata.
6780     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6781       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6782       if (MD) {
6783         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6784         IsUnrollMetadata =
6785             S && S->getString().startswith("llvm.loop.unroll.disable");
6786       }
6787       MDs.push_back(LoopID->getOperand(i));
6788     }
6789   }
6790 
6791   if (!IsUnrollMetadata) {
6792     // Add runtime unroll disable metadata.
6793     LLVMContext &Context = L->getHeader()->getContext();
6794     SmallVector<Metadata *, 1> DisableOperands;
6795     DisableOperands.push_back(
6796         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6797     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6798     MDs.push_back(DisableNode);
6799     MDNode *NewLoopID = MDNode::get(Context, MDs);
6800     // Set operand 0 to refer to the loop id itself.
6801     NewLoopID->replaceOperandWith(0, NewLoopID);
6802     L->setLoopID(NewLoopID);
6803   }
6804 }
6805 
6806 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6807     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6808   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6809   bool PredicateAtRangeStart = Predicate(Range.Start);
6810 
6811   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6812     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6813       Range.End = TmpVF;
6814       break;
6815     }
6816 
6817   return PredicateAtRangeStart;
6818 }
6819 
6820 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6821 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6822 /// of VF's starting at a given VF and extending it as much as possible. Each
6823 /// vectorization decision can potentially shorten this sub-range during
6824 /// buildVPlan().
6825 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6826   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6827     VFRange SubRange = {VF, MaxVF + 1};
6828     VPlans.push_back(buildVPlan(SubRange));
6829     VF = SubRange.End;
6830   }
6831 }
6832 
6833 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6834                                          VPlanPtr &Plan) {
6835   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6836 
6837   // Look for cached value.
6838   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6839   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6840   if (ECEntryIt != EdgeMaskCache.end())
6841     return ECEntryIt->second;
6842 
6843   VPValue *SrcMask = createBlockInMask(Src, Plan);
6844 
6845   // The terminator has to be a branch inst!
6846   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6847   assert(BI && "Unexpected terminator found");
6848 
6849   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6850     return EdgeMaskCache[Edge] = SrcMask;
6851 
6852   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6853   assert(EdgeMask && "No Edge Mask found for condition");
6854 
6855   if (BI->getSuccessor(0) != Dst)
6856     EdgeMask = Builder.createNot(EdgeMask);
6857 
6858   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6859     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6860 
6861   return EdgeMaskCache[Edge] = EdgeMask;
6862 }
6863 
6864 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6865   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6866 
6867   // Look for cached value.
6868   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6869   if (BCEntryIt != BlockMaskCache.end())
6870     return BCEntryIt->second;
6871 
6872   // All-one mask is modelled as no-mask following the convention for masked
6873   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6874   VPValue *BlockMask = nullptr;
6875 
6876   if (OrigLoop->getHeader() == BB) {
6877     if (!CM.blockNeedsPredication(BB))
6878       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6879 
6880     // Introduce the early-exit compare IV <= BTC to form header block mask.
6881     // This is used instead of IV < TC because TC may wrap, unlike BTC.
6882     // Start by constructing the desired canonical IV.
6883     VPValue *IV = nullptr;
6884     if (Legal->getPrimaryInduction())
6885       IV = Plan->getVPValue(Legal->getPrimaryInduction());
6886     else {
6887       auto IVRecipe = new VPWidenCanonicalIVRecipe();
6888       Builder.getInsertBlock()->appendRecipe(IVRecipe);
6889       IV = IVRecipe->getVPValue();
6890     }
6891     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6892     bool TailFolded = !CM.isScalarEpilogueAllowed();
6893     if (TailFolded && CM.TTI.emitGetActiveLaneMask())
6894       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
6895     else
6896       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6897     return BlockMaskCache[BB] = BlockMask;
6898   }
6899 
6900   // This is the block mask. We OR all incoming edges.
6901   for (auto *Predecessor : predecessors(BB)) {
6902     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6903     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6904       return BlockMaskCache[BB] = EdgeMask;
6905 
6906     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6907       BlockMask = EdgeMask;
6908       continue;
6909     }
6910 
6911     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6912   }
6913 
6914   return BlockMaskCache[BB] = BlockMask;
6915 }
6916 
6917 VPWidenMemoryInstructionRecipe *
6918 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6919                                   VPlanPtr &Plan) {
6920   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6921          "Must be called with either a load or store");
6922 
6923   auto willWiden = [&](unsigned VF) -> bool {
6924     if (VF == 1)
6925       return false;
6926     LoopVectorizationCostModel::InstWidening Decision =
6927         CM.getWideningDecision(I, VF);
6928     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6929            "CM decision should be taken at this point.");
6930     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6931       return true;
6932     if (CM.isScalarAfterVectorization(I, VF) ||
6933         CM.isProfitableToScalarize(I, VF))
6934       return false;
6935     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6936   };
6937 
6938   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6939     return nullptr;
6940 
6941   VPValue *Mask = nullptr;
6942   if (Legal->isMaskRequired(I))
6943     Mask = createBlockInMask(I->getParent(), Plan);
6944 
6945   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6946   if (LoadInst *Load = dyn_cast<LoadInst>(I))
6947     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
6948 
6949   StoreInst *Store = cast<StoreInst>(I);
6950   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
6951   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
6952 }
6953 
6954 VPWidenIntOrFpInductionRecipe *
6955 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
6956   // Check if this is an integer or fp induction. If so, build the recipe that
6957   // produces its scalar and vector values.
6958   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6959   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6960       II.getKind() == InductionDescriptor::IK_FpInduction)
6961     return new VPWidenIntOrFpInductionRecipe(Phi);
6962 
6963   return nullptr;
6964 }
6965 
6966 VPWidenIntOrFpInductionRecipe *
6967 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
6968                                                 VFRange &Range) const {
6969   // Optimize the special case where the source is a constant integer
6970   // induction variable. Notice that we can only optimize the 'trunc' case
6971   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6972   // (c) other casts depend on pointer size.
6973 
6974   // Determine whether \p K is a truncation based on an induction variable that
6975   // can be optimized.
6976   auto isOptimizableIVTruncate =
6977       [&](Instruction *K) -> std::function<bool(unsigned)> {
6978     return
6979         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6980   };
6981 
6982   if (LoopVectorizationPlanner::getDecisionAndClampRange(
6983           isOptimizableIVTruncate(I), Range))
6984     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6985                                              I);
6986   return nullptr;
6987 }
6988 
6989 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
6990   // We know that all PHIs in non-header blocks are converted into selects, so
6991   // we don't have to worry about the insertion order and we can just use the
6992   // builder. At this point we generate the predication tree. There may be
6993   // duplications since this is a simple recursive scan, but future
6994   // optimizations will clean it up.
6995 
6996   SmallVector<VPValue *, 2> Operands;
6997   unsigned NumIncoming = Phi->getNumIncomingValues();
6998   for (unsigned In = 0; In < NumIncoming; In++) {
6999     VPValue *EdgeMask =
7000       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7001     assert((EdgeMask || NumIncoming == 1) &&
7002            "Multiple predecessors with one having a full mask");
7003     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7004     if (EdgeMask)
7005       Operands.push_back(EdgeMask);
7006   }
7007   return new VPBlendRecipe(Phi, Operands);
7008 }
7009 
7010 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7011                                                    VPlan &Plan) const {
7012 
7013   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7014       [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); },
7015       Range);
7016 
7017   if (IsPredicated)
7018     return nullptr;
7019 
7020   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7021   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7022              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7023     return nullptr;
7024 
7025   auto willWiden = [&](unsigned VF) -> bool {
7026     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7027     // The following case may be scalarized depending on the VF.
7028     // The flag shows whether we use Intrinsic or a usual Call for vectorized
7029     // version of the instruction.
7030     // Is it beneficial to perform intrinsic call compared to lib call?
7031     bool NeedToScalarize = false;
7032     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7033     bool UseVectorIntrinsic =
7034         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7035     return UseVectorIntrinsic || !NeedToScalarize;
7036   };
7037 
7038   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7039     return nullptr;
7040 
7041   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7042 }
7043 
7044 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7045   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7046          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7047   // Instruction should be widened, unless it is scalar after vectorization,
7048   // scalarization is profitable or it is predicated.
7049   auto WillScalarize = [this, I](unsigned VF) -> bool {
7050     return CM.isScalarAfterVectorization(I, VF) ||
7051            CM.isProfitableToScalarize(I, VF) ||
7052            CM.isScalarWithPredication(I, VF);
7053   };
7054   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7055                                                              Range);
7056 }
7057 
7058 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7059   auto IsVectorizableOpcode = [](unsigned Opcode) {
7060     switch (Opcode) {
7061     case Instruction::Add:
7062     case Instruction::And:
7063     case Instruction::AShr:
7064     case Instruction::BitCast:
7065     case Instruction::FAdd:
7066     case Instruction::FCmp:
7067     case Instruction::FDiv:
7068     case Instruction::FMul:
7069     case Instruction::FNeg:
7070     case Instruction::FPExt:
7071     case Instruction::FPToSI:
7072     case Instruction::FPToUI:
7073     case Instruction::FPTrunc:
7074     case Instruction::FRem:
7075     case Instruction::FSub:
7076     case Instruction::ICmp:
7077     case Instruction::IntToPtr:
7078     case Instruction::LShr:
7079     case Instruction::Mul:
7080     case Instruction::Or:
7081     case Instruction::PtrToInt:
7082     case Instruction::SDiv:
7083     case Instruction::Select:
7084     case Instruction::SExt:
7085     case Instruction::Shl:
7086     case Instruction::SIToFP:
7087     case Instruction::SRem:
7088     case Instruction::Sub:
7089     case Instruction::Trunc:
7090     case Instruction::UDiv:
7091     case Instruction::UIToFP:
7092     case Instruction::URem:
7093     case Instruction::Xor:
7094     case Instruction::ZExt:
7095       return true;
7096     }
7097     return false;
7098   };
7099 
7100   if (!IsVectorizableOpcode(I->getOpcode()))
7101     return nullptr;
7102 
7103   // Success: widen this instruction.
7104   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7105 }
7106 
7107 VPBasicBlock *VPRecipeBuilder::handleReplication(
7108     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7109     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7110     VPlanPtr &Plan) {
7111   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7112       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7113       Range);
7114 
7115   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7116       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7117 
7118   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7119                                        IsUniform, IsPredicated);
7120   setRecipe(I, Recipe);
7121 
7122   // Find if I uses a predicated instruction. If so, it will use its scalar
7123   // value. Avoid hoisting the insert-element which packs the scalar value into
7124   // a vector value, as that happens iff all users use the vector value.
7125   for (auto &Op : I->operands())
7126     if (auto *PredInst = dyn_cast<Instruction>(Op))
7127       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7128         PredInst2Recipe[PredInst]->setAlsoPack(false);
7129 
7130   // Finalize the recipe for Instr, first if it is not predicated.
7131   if (!IsPredicated) {
7132     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7133     VPBB->appendRecipe(Recipe);
7134     return VPBB;
7135   }
7136   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7137   assert(VPBB->getSuccessors().empty() &&
7138          "VPBB has successors when handling predicated replication.");
7139   // Record predicated instructions for above packing optimizations.
7140   PredInst2Recipe[I] = Recipe;
7141   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7142   VPBlockUtils::insertBlockAfter(Region, VPBB);
7143   auto *RegSucc = new VPBasicBlock();
7144   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7145   return RegSucc;
7146 }
7147 
7148 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7149                                                       VPRecipeBase *PredRecipe,
7150                                                       VPlanPtr &Plan) {
7151   // Instructions marked for predication are replicated and placed under an
7152   // if-then construct to prevent side-effects.
7153 
7154   // Generate recipes to compute the block mask for this region.
7155   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7156 
7157   // Build the triangular if-then region.
7158   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7159   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7160   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7161   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7162   auto *PHIRecipe =
7163       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7164   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7165   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7166   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7167 
7168   // Note: first set Entry as region entry and then connect successors starting
7169   // from it in order, to propagate the "parent" of each VPBasicBlock.
7170   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7171   VPBlockUtils::connectBlocks(Pred, Exit);
7172 
7173   return Region;
7174 }
7175 
7176 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7177                                                       VFRange &Range,
7178                                                       VPlanPtr &Plan) {
7179   // First, check for specific widening recipes that deal with calls, memory
7180   // operations, inductions and Phi nodes.
7181   if (auto *CI = dyn_cast<CallInst>(Instr))
7182     return tryToWidenCall(CI, Range, *Plan);
7183 
7184   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7185     return tryToWidenMemory(Instr, Range, Plan);
7186 
7187   VPRecipeBase *Recipe;
7188   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7189     if (Phi->getParent() != OrigLoop->getHeader())
7190       return tryToBlend(Phi, Plan);
7191     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7192       return Recipe;
7193     return new VPWidenPHIRecipe(Phi);
7194   }
7195 
7196   if (isa<TruncInst>(Instr) &&
7197       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7198     return Recipe;
7199 
7200   if (!shouldWiden(Instr, Range))
7201     return nullptr;
7202 
7203   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7204     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7205                                 OrigLoop);
7206 
7207   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7208     bool InvariantCond =
7209         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7210     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7211                                    InvariantCond);
7212   }
7213 
7214   return tryToWiden(Instr, *Plan);
7215 }
7216 
7217 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7218                                                         unsigned MaxVF) {
7219   assert(OrigLoop->empty() && "Inner loop expected.");
7220 
7221   // Collect conditions feeding internal conditional branches; they need to be
7222   // represented in VPlan for it to model masking.
7223   SmallPtrSet<Value *, 1> NeedDef;
7224 
7225   auto *Latch = OrigLoop->getLoopLatch();
7226   for (BasicBlock *BB : OrigLoop->blocks()) {
7227     if (BB == Latch)
7228       continue;
7229     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7230     if (Branch && Branch->isConditional())
7231       NeedDef.insert(Branch->getCondition());
7232   }
7233 
7234   // If the tail is to be folded by masking, the primary induction variable, if
7235   // exists needs to be represented in VPlan for it to model early-exit masking.
7236   // Also, both the Phi and the live-out instruction of each reduction are
7237   // required in order to introduce a select between them in VPlan.
7238   if (CM.foldTailByMasking()) {
7239     if (Legal->getPrimaryInduction())
7240       NeedDef.insert(Legal->getPrimaryInduction());
7241     for (auto &Reduction : Legal->getReductionVars()) {
7242       NeedDef.insert(Reduction.first);
7243       NeedDef.insert(Reduction.second.getLoopExitInstr());
7244     }
7245   }
7246 
7247   // Collect instructions from the original loop that will become trivially dead
7248   // in the vectorized loop. We don't need to vectorize these instructions. For
7249   // example, original induction update instructions can become dead because we
7250   // separately emit induction "steps" when generating code for the new loop.
7251   // Similarly, we create a new latch condition when setting up the structure
7252   // of the new loop, so the old one can become dead.
7253   SmallPtrSet<Instruction *, 4> DeadInstructions;
7254   collectTriviallyDeadInstructions(DeadInstructions);
7255 
7256   // Add assume instructions we need to drop to DeadInstructions, to prevent
7257   // them from being added to the VPlan.
7258   // TODO: We only need to drop assumes in blocks that get flattend. If the
7259   // control flow is preserved, we should keep them.
7260   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7261   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7262 
7263   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7264   // Dead instructions do not need sinking. Remove them from SinkAfter.
7265   for (Instruction *I : DeadInstructions)
7266     SinkAfter.erase(I);
7267 
7268   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7269     VFRange SubRange = {VF, MaxVF + 1};
7270     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7271                                              DeadInstructions, SinkAfter));
7272     VF = SubRange.End;
7273   }
7274 }
7275 
7276 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7277     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7278     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7279     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7280 
7281   // Hold a mapping from predicated instructions to their recipes, in order to
7282   // fix their AlsoPack behavior if a user is determined to replicate and use a
7283   // scalar instead of vector value.
7284   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7285 
7286   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7287 
7288   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7289 
7290   // ---------------------------------------------------------------------------
7291   // Pre-construction: record ingredients whose recipes we'll need to further
7292   // process after constructing the initial VPlan.
7293   // ---------------------------------------------------------------------------
7294 
7295   // Mark instructions we'll need to sink later and their targets as
7296   // ingredients whose recipe we'll need to record.
7297   for (auto &Entry : SinkAfter) {
7298     RecipeBuilder.recordRecipeOf(Entry.first);
7299     RecipeBuilder.recordRecipeOf(Entry.second);
7300   }
7301 
7302   // For each interleave group which is relevant for this (possibly trimmed)
7303   // Range, add it to the set of groups to be later applied to the VPlan and add
7304   // placeholders for its members' Recipes which we'll be replacing with a
7305   // single VPInterleaveRecipe.
7306   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7307     auto applyIG = [IG, this](unsigned VF) -> bool {
7308       return (VF >= 2 && // Query is illegal for VF == 1
7309               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7310                   LoopVectorizationCostModel::CM_Interleave);
7311     };
7312     if (!getDecisionAndClampRange(applyIG, Range))
7313       continue;
7314     InterleaveGroups.insert(IG);
7315     for (unsigned i = 0; i < IG->getFactor(); i++)
7316       if (Instruction *Member = IG->getMember(i))
7317         RecipeBuilder.recordRecipeOf(Member);
7318   };
7319 
7320   // ---------------------------------------------------------------------------
7321   // Build initial VPlan: Scan the body of the loop in a topological order to
7322   // visit each basic block after having visited its predecessor basic blocks.
7323   // ---------------------------------------------------------------------------
7324 
7325   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7326   auto Plan = std::make_unique<VPlan>();
7327   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7328   Plan->setEntry(VPBB);
7329 
7330   // Represent values that will have defs inside VPlan.
7331   for (Value *V : NeedDef)
7332     Plan->addVPValue(V);
7333 
7334   // Scan the body of the loop in a topological order to visit each basic block
7335   // after having visited its predecessor basic blocks.
7336   LoopBlocksDFS DFS(OrigLoop);
7337   DFS.perform(LI);
7338 
7339   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7340     // Relevant instructions from basic block BB will be grouped into VPRecipe
7341     // ingredients and fill a new VPBasicBlock.
7342     unsigned VPBBsForBB = 0;
7343     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7344     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7345     VPBB = FirstVPBBForBB;
7346     Builder.setInsertPoint(VPBB);
7347 
7348     // Introduce each ingredient into VPlan.
7349     // TODO: Model and preserve debug instrinsics in VPlan.
7350     for (Instruction &I : BB->instructionsWithoutDebug()) {
7351       Instruction *Instr = &I;
7352 
7353       // First filter out irrelevant instructions, to ensure no recipes are
7354       // built for them.
7355       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7356         continue;
7357 
7358       if (auto Recipe =
7359               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7360         RecipeBuilder.setRecipe(Instr, Recipe);
7361         VPBB->appendRecipe(Recipe);
7362         continue;
7363       }
7364 
7365       // Otherwise, if all widening options failed, Instruction is to be
7366       // replicated. This may create a successor for VPBB.
7367       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7368           Instr, Range, VPBB, PredInst2Recipe, Plan);
7369       if (NextVPBB != VPBB) {
7370         VPBB = NextVPBB;
7371         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7372                                     : "");
7373       }
7374     }
7375   }
7376 
7377   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7378   // may also be empty, such as the last one VPBB, reflecting original
7379   // basic-blocks with no recipes.
7380   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7381   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7382   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7383   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7384   delete PreEntry;
7385 
7386   // ---------------------------------------------------------------------------
7387   // Transform initial VPlan: Apply previously taken decisions, in order, to
7388   // bring the VPlan to its final state.
7389   // ---------------------------------------------------------------------------
7390 
7391   // Apply Sink-After legal constraints.
7392   for (auto &Entry : SinkAfter) {
7393     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7394     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7395     Sink->moveAfter(Target);
7396   }
7397 
7398   // Interleave memory: for each Interleave Group we marked earlier as relevant
7399   // for this VPlan, replace the Recipes widening its memory instructions with a
7400   // single VPInterleaveRecipe at its insertion point.
7401   for (auto IG : InterleaveGroups) {
7402     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7403         RecipeBuilder.getRecipe(IG->getInsertPos()));
7404     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7405         ->insertBefore(Recipe);
7406 
7407     for (unsigned i = 0; i < IG->getFactor(); ++i)
7408       if (Instruction *Member = IG->getMember(i)) {
7409         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7410       }
7411   }
7412 
7413   // Finally, if tail is folded by masking, introduce selects between the phi
7414   // and the live-out instruction of each reduction, at the end of the latch.
7415   if (CM.foldTailByMasking()) {
7416     Builder.setInsertPoint(VPBB);
7417     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7418     for (auto &Reduction : Legal->getReductionVars()) {
7419       VPValue *Phi = Plan->getVPValue(Reduction.first);
7420       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7421       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7422     }
7423   }
7424 
7425   std::string PlanName;
7426   raw_string_ostream RSO(PlanName);
7427   unsigned VF = Range.Start;
7428   Plan->addVF(VF);
7429   RSO << "Initial VPlan for VF={" << VF;
7430   for (VF *= 2; VF < Range.End; VF *= 2) {
7431     Plan->addVF(VF);
7432     RSO << "," << VF;
7433   }
7434   RSO << "},UF>=1";
7435   RSO.flush();
7436   Plan->setName(PlanName);
7437 
7438   return Plan;
7439 }
7440 
7441 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7442   // Outer loop handling: They may require CFG and instruction level
7443   // transformations before even evaluating whether vectorization is profitable.
7444   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7445   // the vectorization pipeline.
7446   assert(!OrigLoop->empty());
7447   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7448 
7449   // Create new empty VPlan
7450   auto Plan = std::make_unique<VPlan>();
7451 
7452   // Build hierarchical CFG
7453   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7454   HCFGBuilder.buildHierarchicalCFG();
7455 
7456   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7457     Plan->addVF(VF);
7458 
7459   if (EnableVPlanPredication) {
7460     VPlanPredicator VPP(*Plan);
7461     VPP.predicate();
7462 
7463     // Avoid running transformation to recipes until masked code generation in
7464     // VPlan-native path is in place.
7465     return Plan;
7466   }
7467 
7468   SmallPtrSet<Instruction *, 1> DeadInstructions;
7469   VPlanTransforms::VPInstructionsToVPRecipes(
7470       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7471   return Plan;
7472 }
7473 
7474 Value* LoopVectorizationPlanner::VPCallbackILV::
7475 getOrCreateVectorValues(Value *V, unsigned Part) {
7476       return ILV.getOrCreateVectorValue(V, Part);
7477 }
7478 
7479 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7480     Value *V, const VPIteration &Instance) {
7481   return ILV.getOrCreateScalarValue(V, Instance);
7482 }
7483 
7484 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7485                                VPSlotTracker &SlotTracker) const {
7486   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7487   IG->getInsertPos()->printAsOperand(O, false);
7488   O << ", ";
7489   getAddr()->printAsOperand(O, SlotTracker);
7490   VPValue *Mask = getMask();
7491   if (Mask) {
7492     O << ", ";
7493     Mask->printAsOperand(O, SlotTracker);
7494   }
7495   for (unsigned i = 0; i < IG->getFactor(); ++i)
7496     if (Instruction *I = IG->getMember(i))
7497       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7498 }
7499 
7500 void VPWidenCallRecipe::execute(VPTransformState &State) {
7501   State.ILV->widenCallInstruction(Ingredient, User, State);
7502 }
7503 
7504 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7505   State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
7506 }
7507 
7508 void VPWidenRecipe::execute(VPTransformState &State) {
7509   State.ILV->widenInstruction(Ingredient, User, State);
7510 }
7511 
7512 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7513   State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
7514                       IsIndexLoopInvariant, State);
7515 }
7516 
7517 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7518   assert(!State.Instance && "Int or FP induction being replicated.");
7519   State.ILV->widenIntOrFpInduction(IV, Trunc);
7520 }
7521 
7522 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7523   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7524 }
7525 
7526 void VPBlendRecipe::execute(VPTransformState &State) {
7527   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7528   // We know that all PHIs in non-header blocks are converted into
7529   // selects, so we don't have to worry about the insertion order and we
7530   // can just use the builder.
7531   // At this point we generate the predication tree. There may be
7532   // duplications since this is a simple recursive scan, but future
7533   // optimizations will clean it up.
7534 
7535   unsigned NumIncoming = getNumIncomingValues();
7536 
7537   // Generate a sequence of selects of the form:
7538   // SELECT(Mask3, In3,
7539   //        SELECT(Mask2, In2,
7540   //               SELECT(Mask1, In1,
7541   //                      In0)))
7542   // Note that Mask0 is never used: lanes for which no path reaches this phi and
7543   // are essentially undef are taken from In0.
7544   InnerLoopVectorizer::VectorParts Entry(State.UF);
7545   for (unsigned In = 0; In < NumIncoming; ++In) {
7546     for (unsigned Part = 0; Part < State.UF; ++Part) {
7547       // We might have single edge PHIs (blocks) - use an identity
7548       // 'select' for the first PHI operand.
7549       Value *In0 = State.get(getIncomingValue(In), Part);
7550       if (In == 0)
7551         Entry[Part] = In0; // Initialize with the first incoming value.
7552       else {
7553         // Select between the current value and the previous incoming edge
7554         // based on the incoming mask.
7555         Value *Cond = State.get(getMask(In), Part);
7556         Entry[Part] =
7557             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7558       }
7559     }
7560   }
7561   for (unsigned Part = 0; Part < State.UF; ++Part)
7562     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7563 }
7564 
7565 void VPInterleaveRecipe::execute(VPTransformState &State) {
7566   assert(!State.Instance && "Interleave group being replicated.");
7567   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7568 }
7569 
7570 void VPReplicateRecipe::execute(VPTransformState &State) {
7571   if (State.Instance) { // Generate a single instance.
7572     State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
7573                                     IsPredicated, State);
7574     // Insert scalar instance packing it into a vector.
7575     if (AlsoPack && State.VF > 1) {
7576       // If we're constructing lane 0, initialize to start from undef.
7577       if (State.Instance->Lane == 0) {
7578         Value *Undef = UndefValue::get(
7579             FixedVectorType::get(Ingredient->getType(), State.VF));
7580         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7581       }
7582       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7583     }
7584     return;
7585   }
7586 
7587   // Generate scalar instances for all VF lanes of all UF parts, unless the
7588   // instruction is uniform inwhich case generate only the first lane for each
7589   // of the UF parts.
7590   unsigned EndLane = IsUniform ? 1 : State.VF;
7591   for (unsigned Part = 0; Part < State.UF; ++Part)
7592     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7593       State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
7594                                       IsPredicated, State);
7595 }
7596 
7597 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7598   assert(State.Instance && "Branch on Mask works only on single instance.");
7599 
7600   unsigned Part = State.Instance->Part;
7601   unsigned Lane = State.Instance->Lane;
7602 
7603   Value *ConditionBit = nullptr;
7604   VPValue *BlockInMask = getMask();
7605   if (BlockInMask) {
7606     ConditionBit = State.get(BlockInMask, Part);
7607     if (ConditionBit->getType()->isVectorTy())
7608       ConditionBit = State.Builder.CreateExtractElement(
7609           ConditionBit, State.Builder.getInt32(Lane));
7610   } else // Block in mask is all-one.
7611     ConditionBit = State.Builder.getTrue();
7612 
7613   // Replace the temporary unreachable terminator with a new conditional branch,
7614   // whose two destinations will be set later when they are created.
7615   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7616   assert(isa<UnreachableInst>(CurrentTerminator) &&
7617          "Expected to replace unreachable terminator with conditional branch.");
7618   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7619   CondBr->setSuccessor(0, nullptr);
7620   ReplaceInstWithInst(CurrentTerminator, CondBr);
7621 }
7622 
7623 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7624   assert(State.Instance && "Predicated instruction PHI works per instance.");
7625   Instruction *ScalarPredInst = cast<Instruction>(
7626       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7627   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7628   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7629   assert(PredicatingBB && "Predicated block has no single predecessor.");
7630 
7631   // By current pack/unpack logic we need to generate only a single phi node: if
7632   // a vector value for the predicated instruction exists at this point it means
7633   // the instruction has vector users only, and a phi for the vector value is
7634   // needed. In this case the recipe of the predicated instruction is marked to
7635   // also do that packing, thereby "hoisting" the insert-element sequence.
7636   // Otherwise, a phi node for the scalar value is needed.
7637   unsigned Part = State.Instance->Part;
7638   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7639     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7640     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7641     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7642     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7643     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7644     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7645   } else {
7646     Type *PredInstType = PredInst->getType();
7647     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7648     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7649     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7650     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7651   }
7652 }
7653 
7654 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7655   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7656   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7657                                         getMask());
7658 }
7659 
7660 // Determine how to lower the scalar epilogue, which depends on 1) optimising
7661 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7662 // predication, and 4) a TTI hook that analyses whether the loop is suitable
7663 // for predication.
7664 static ScalarEpilogueLowering getScalarEpilogueLowering(
7665     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7666     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7667     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7668     LoopVectorizationLegality &LVL) {
7669   bool OptSize =
7670       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7671                                                      PGSOQueryType::IRPass);
7672   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7673   // don't look at hints or options, and don't request a scalar epilogue.
7674   if (OptSize)
7675     return CM_ScalarEpilogueNotAllowedOptSize;
7676 
7677   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7678                               !PreferPredicateOverEpilog;
7679 
7680   // 2) Next, if disabling predication is requested on the command line, honour
7681   // this and request a scalar epilogue.
7682   if (PredicateOptDisabled)
7683     return CM_ScalarEpilogueAllowed;
7684 
7685   // 3) and 4) look if enabling predication is requested on the command line,
7686   // with a loop hint, or if the TTI hook indicates this is profitable, request
7687   // predication .
7688   if (PreferPredicateOverEpilog ||
7689       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7690       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7691                                         LVL.getLAI()) &&
7692        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7693     return CM_ScalarEpilogueNotNeededUsePredicate;
7694 
7695   return CM_ScalarEpilogueAllowed;
7696 }
7697 
7698 // Process the loop in the VPlan-native vectorization path. This path builds
7699 // VPlan upfront in the vectorization pipeline, which allows to apply
7700 // VPlan-to-VPlan transformations from the very beginning without modifying the
7701 // input LLVM IR.
7702 static bool processLoopInVPlanNativePath(
7703     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7704     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7705     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7706     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7707     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7708 
7709   if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
7710     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
7711     return false;
7712   }
7713   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7714   Function *F = L->getHeader()->getParent();
7715   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7716 
7717   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7718       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7719 
7720   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7721                                 &Hints, IAI);
7722   // Use the planner for outer loop vectorization.
7723   // TODO: CM is not used at this point inside the planner. Turn CM into an
7724   // optional argument if we don't need it in the future.
7725   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7726 
7727   // Get user vectorization factor.
7728   const unsigned UserVF = Hints.getWidth();
7729 
7730   // Plan how to best vectorize, return the best VF and its cost.
7731   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7732 
7733   // If we are stress testing VPlan builds, do not attempt to generate vector
7734   // code. Masked vector code generation support will follow soon.
7735   // Also, do not attempt to vectorize if no vector code will be produced.
7736   if (VPlanBuildStressTest || EnableVPlanPredication ||
7737       VectorizationFactor::Disabled() == VF)
7738     return false;
7739 
7740   LVP.setBestPlan(VF.Width, 1);
7741 
7742   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7743                          &CM, BFI, PSI);
7744   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7745                     << L->getHeader()->getParent()->getName() << "\"\n");
7746   LVP.executePlan(LB, DT);
7747 
7748   // Mark the loop as already vectorized to avoid vectorizing again.
7749   Hints.setAlreadyVectorized();
7750 
7751   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
7752   return true;
7753 }
7754 
7755 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
7756     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
7757                                !EnableLoopInterleaving),
7758       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
7759                               !EnableLoopVectorization) {}
7760 
7761 bool LoopVectorizePass::processLoop(Loop *L) {
7762   assert((EnableVPlanNativePath || L->empty()) &&
7763          "VPlan-native path is not enabled. Only process inner loops.");
7764 
7765 #ifndef NDEBUG
7766   const std::string DebugLocStr = getDebugLocString(L);
7767 #endif /* NDEBUG */
7768 
7769   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7770                     << L->getHeader()->getParent()->getName() << "\" from "
7771                     << DebugLocStr << "\n");
7772 
7773   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7774 
7775   LLVM_DEBUG(
7776       dbgs() << "LV: Loop hints:"
7777              << " force="
7778              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7779                      ? "disabled"
7780                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7781                             ? "enabled"
7782                             : "?"))
7783              << " width=" << Hints.getWidth()
7784              << " unroll=" << Hints.getInterleave() << "\n");
7785 
7786   // Function containing loop
7787   Function *F = L->getHeader()->getParent();
7788 
7789   // Looking at the diagnostic output is the only way to determine if a loop
7790   // was vectorized (other than looking at the IR or machine code), so it
7791   // is important to generate an optimization remark for each loop. Most of
7792   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7793   // generated as OptimizationRemark and OptimizationRemarkMissed are
7794   // less verbose reporting vectorized loops and unvectorized loops that may
7795   // benefit from vectorization, respectively.
7796 
7797   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7798     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7799     return false;
7800   }
7801 
7802   PredicatedScalarEvolution PSE(*SE, *L);
7803 
7804   // Check if it is legal to vectorize the loop.
7805   LoopVectorizationRequirements Requirements(*ORE);
7806   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7807                                 &Requirements, &Hints, DB, AC, BFI, PSI);
7808   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7809     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7810     Hints.emitRemarkWithHints();
7811     return false;
7812   }
7813 
7814   // Check the function attributes and profiles to find out if this function
7815   // should be optimized for size.
7816   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7817       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7818 
7819   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7820   // here. They may require CFG and instruction level transformations before
7821   // even evaluating whether vectorization is profitable. Since we cannot modify
7822   // the incoming IR, we need to build VPlan upfront in the vectorization
7823   // pipeline.
7824   if (!L->empty())
7825     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7826                                         ORE, BFI, PSI, Hints);
7827 
7828   assert(L->empty() && "Inner loop expected.");
7829 
7830   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7831   // count by optimizing for size, to minimize overheads.
7832   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7833   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7834     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7835                       << "This loop is worth vectorizing only if no scalar "
7836                       << "iteration overheads are incurred.");
7837     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7838       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7839     else {
7840       LLVM_DEBUG(dbgs() << "\n");
7841       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7842     }
7843   }
7844 
7845   // Check the function attributes to see if implicit floats are allowed.
7846   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7847   // an integer loop and the vector instructions selected are purely integer
7848   // vector instructions?
7849   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7850     reportVectorizationFailure(
7851         "Can't vectorize when the NoImplicitFloat attribute is used",
7852         "loop not vectorized due to NoImplicitFloat attribute",
7853         "NoImplicitFloat", ORE, L);
7854     Hints.emitRemarkWithHints();
7855     return false;
7856   }
7857 
7858   // Check if the target supports potentially unsafe FP vectorization.
7859   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7860   // for the target we're vectorizing for, to make sure none of the
7861   // additional fp-math flags can help.
7862   if (Hints.isPotentiallyUnsafe() &&
7863       TTI->isFPVectorizationPotentiallyUnsafe()) {
7864     reportVectorizationFailure(
7865         "Potentially unsafe FP op prevents vectorization",
7866         "loop not vectorized due to unsafe FP support.",
7867         "UnsafeFP", ORE, L);
7868     Hints.emitRemarkWithHints();
7869     return false;
7870   }
7871 
7872   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7873   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7874 
7875   // If an override option has been passed in for interleaved accesses, use it.
7876   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7877     UseInterleaved = EnableInterleavedMemAccesses;
7878 
7879   // Analyze interleaved memory accesses.
7880   if (UseInterleaved) {
7881     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7882   }
7883 
7884   // Use the cost model.
7885   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7886                                 F, &Hints, IAI);
7887   CM.collectValuesToIgnore();
7888 
7889   // Use the planner for vectorization.
7890   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
7891 
7892   // Get user vectorization factor and interleave count.
7893   unsigned UserVF = Hints.getWidth();
7894   unsigned UserIC = Hints.getInterleave();
7895 
7896   // Plan how to best vectorize, return the best VF and its cost.
7897   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
7898 
7899   VectorizationFactor VF = VectorizationFactor::Disabled();
7900   unsigned IC = 1;
7901 
7902   if (MaybeVF) {
7903     VF = *MaybeVF;
7904     // Select the interleave count.
7905     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7906   }
7907 
7908   // Identify the diagnostic messages that should be produced.
7909   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7910   bool VectorizeLoop = true, InterleaveLoop = true;
7911   if (Requirements.doesNotMeet(F, L, Hints)) {
7912     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7913                          "requirements.\n");
7914     Hints.emitRemarkWithHints();
7915     return false;
7916   }
7917 
7918   if (VF.Width == 1) {
7919     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7920     VecDiagMsg = std::make_pair(
7921         "VectorizationNotBeneficial",
7922         "the cost-model indicates that vectorization is not beneficial");
7923     VectorizeLoop = false;
7924   }
7925 
7926   if (!MaybeVF && UserIC > 1) {
7927     // Tell the user interleaving was avoided up-front, despite being explicitly
7928     // requested.
7929     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7930                          "interleaving should be avoided up front\n");
7931     IntDiagMsg = std::make_pair(
7932         "InterleavingAvoided",
7933         "Ignoring UserIC, because interleaving was avoided up front");
7934     InterleaveLoop = false;
7935   } else if (IC == 1 && UserIC <= 1) {
7936     // Tell the user interleaving is not beneficial.
7937     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7938     IntDiagMsg = std::make_pair(
7939         "InterleavingNotBeneficial",
7940         "the cost-model indicates that interleaving is not beneficial");
7941     InterleaveLoop = false;
7942     if (UserIC == 1) {
7943       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7944       IntDiagMsg.second +=
7945           " and is explicitly disabled or interleave count is set to 1";
7946     }
7947   } else if (IC > 1 && UserIC == 1) {
7948     // Tell the user interleaving is beneficial, but it explicitly disabled.
7949     LLVM_DEBUG(
7950         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7951     IntDiagMsg = std::make_pair(
7952         "InterleavingBeneficialButDisabled",
7953         "the cost-model indicates that interleaving is beneficial "
7954         "but is explicitly disabled or interleave count is set to 1");
7955     InterleaveLoop = false;
7956   }
7957 
7958   // Override IC if user provided an interleave count.
7959   IC = UserIC > 0 ? UserIC : IC;
7960 
7961   // Emit diagnostic messages, if any.
7962   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7963   if (!VectorizeLoop && !InterleaveLoop) {
7964     // Do not vectorize or interleaving the loop.
7965     ORE->emit([&]() {
7966       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7967                                       L->getStartLoc(), L->getHeader())
7968              << VecDiagMsg.second;
7969     });
7970     ORE->emit([&]() {
7971       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7972                                       L->getStartLoc(), L->getHeader())
7973              << IntDiagMsg.second;
7974     });
7975     return false;
7976   } else if (!VectorizeLoop && InterleaveLoop) {
7977     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7978     ORE->emit([&]() {
7979       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7980                                         L->getStartLoc(), L->getHeader())
7981              << VecDiagMsg.second;
7982     });
7983   } else if (VectorizeLoop && !InterleaveLoop) {
7984     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7985                       << ") in " << DebugLocStr << '\n');
7986     ORE->emit([&]() {
7987       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7988                                         L->getStartLoc(), L->getHeader())
7989              << IntDiagMsg.second;
7990     });
7991   } else if (VectorizeLoop && InterleaveLoop) {
7992     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7993                       << ") in " << DebugLocStr << '\n');
7994     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7995   }
7996 
7997   LVP.setBestPlan(VF.Width, IC);
7998 
7999   using namespace ore;
8000   bool DisableRuntimeUnroll = false;
8001   MDNode *OrigLoopID = L->getLoopID();
8002 
8003   if (!VectorizeLoop) {
8004     assert(IC > 1 && "interleave count should not be 1 or 0");
8005     // If we decided that it is not legal to vectorize the loop, then
8006     // interleave it.
8007     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
8008                                BFI, PSI);
8009     LVP.executePlan(Unroller, DT);
8010 
8011     ORE->emit([&]() {
8012       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8013                                 L->getHeader())
8014              << "interleaved loop (interleaved count: "
8015              << NV("InterleaveCount", IC) << ")";
8016     });
8017   } else {
8018     // If we decided that it is *legal* to vectorize the loop, then do it.
8019     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
8020                            &LVL, &CM, BFI, PSI);
8021     LVP.executePlan(LB, DT);
8022     ++LoopsVectorized;
8023 
8024     // Add metadata to disable runtime unrolling a scalar loop when there are
8025     // no runtime checks about strides and memory. A scalar loop that is
8026     // rarely used is not worth unrolling.
8027     if (!LB.areSafetyChecksAdded())
8028       DisableRuntimeUnroll = true;
8029 
8030     // Report the vectorization decision.
8031     ORE->emit([&]() {
8032       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
8033                                 L->getHeader())
8034              << "vectorized loop (vectorization width: "
8035              << NV("VectorizationFactor", VF.Width)
8036              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
8037     });
8038   }
8039 
8040   Optional<MDNode *> RemainderLoopID =
8041       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
8042                                       LLVMLoopVectorizeFollowupEpilogue});
8043   if (RemainderLoopID.hasValue()) {
8044     L->setLoopID(RemainderLoopID.getValue());
8045   } else {
8046     if (DisableRuntimeUnroll)
8047       AddRuntimeUnrollDisableMetaData(L);
8048 
8049     // Mark the loop as already vectorized to avoid vectorizing again.
8050     Hints.setAlreadyVectorized();
8051   }
8052 
8053   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8054   return true;
8055 }
8056 
8057 LoopVectorizeResult LoopVectorizePass::runImpl(
8058     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8059     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8060     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
8061     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8062     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
8063   SE = &SE_;
8064   LI = &LI_;
8065   TTI = &TTI_;
8066   DT = &DT_;
8067   BFI = &BFI_;
8068   TLI = TLI_;
8069   AA = &AA_;
8070   AC = &AC_;
8071   GetLAA = &GetLAA_;
8072   DB = &DB_;
8073   ORE = &ORE_;
8074   PSI = PSI_;
8075 
8076   // Don't attempt if
8077   // 1. the target claims to have no vector registers, and
8078   // 2. interleaving won't help ILP.
8079   //
8080   // The second condition is necessary because, even if the target has no
8081   // vector registers, loop vectorization may still enable scalar
8082   // interleaving.
8083   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8084       TTI->getMaxInterleaveFactor(1) < 2)
8085     return LoopVectorizeResult(false, false);
8086 
8087   bool Changed = false, CFGChanged = false;
8088 
8089   // The vectorizer requires loops to be in simplified form.
8090   // Since simplification may add new inner loops, it has to run before the
8091   // legality and profitability checks. This means running the loop vectorizer
8092   // will simplify all loops, regardless of whether anything end up being
8093   // vectorized.
8094   for (auto &L : *LI)
8095     Changed |= CFGChanged |=
8096         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8097 
8098   // Build up a worklist of inner-loops to vectorize. This is necessary as
8099   // the act of vectorizing or partially unrolling a loop creates new loops
8100   // and can invalidate iterators across the loops.
8101   SmallVector<Loop *, 8> Worklist;
8102 
8103   for (Loop *L : *LI)
8104     collectSupportedLoops(*L, LI, ORE, Worklist);
8105 
8106   LoopsAnalyzed += Worklist.size();
8107 
8108   // Now walk the identified inner loops.
8109   while (!Worklist.empty()) {
8110     Loop *L = Worklist.pop_back_val();
8111 
8112     // For the inner loops we actually process, form LCSSA to simplify the
8113     // transform.
8114     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8115 
8116     Changed |= CFGChanged |= processLoop(L);
8117   }
8118 
8119   // Process each loop nest in the function.
8120   return LoopVectorizeResult(Changed, CFGChanged);
8121 }
8122 
8123 PreservedAnalyses LoopVectorizePass::run(Function &F,
8124                                          FunctionAnalysisManager &AM) {
8125     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
8126     auto &LI = AM.getResult<LoopAnalysis>(F);
8127     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
8128     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
8129     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
8130     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
8131     auto &AA = AM.getResult<AAManager>(F);
8132     auto &AC = AM.getResult<AssumptionAnalysis>(F);
8133     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
8134     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
8135     MemorySSA *MSSA = EnableMSSALoopDependency
8136                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
8137                           : nullptr;
8138 
8139     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
8140     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
8141         [&](Loop &L) -> const LoopAccessInfo & {
8142       LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
8143       return LAM.getResult<LoopAccessAnalysis>(L, AR);
8144     };
8145     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
8146     ProfileSummaryInfo *PSI =
8147         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
8148     LoopVectorizeResult Result =
8149         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
8150     if (!Result.MadeAnyChange)
8151       return PreservedAnalyses::all();
8152     PreservedAnalyses PA;
8153 
8154     // We currently do not preserve loopinfo/dominator analyses with outer loop
8155     // vectorization. Until this is addressed, mark these analyses as preserved
8156     // only for non-VPlan-native path.
8157     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
8158     if (!EnableVPlanNativePath) {
8159       PA.preserve<LoopAnalysis>();
8160       PA.preserve<DominatorTreeAnalysis>();
8161     }
8162     PA.preserve<BasicAA>();
8163     PA.preserve<GlobalsAA>();
8164     if (!Result.MadeCFGChange)
8165       PA.preserveSet<CFGAnalyses>();
8166     return PA;
8167 }
8168