1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
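//
// For example (an illustrative sketch, not taken from any particular test), a
// scalar loop such as
//
//   for (i = 0; i < n; i += 1)
//     A[i] = B[i] + 42;
//
// is conceptually rewritten for a vector width of 4 as
//
//   for (i = 0; i < n; i += 4)
//     A[i:i+3] = B[i:i+3] + <42, 42, 42, 42>;
//
// with any remaining iterations handled by a scalar epilogue loop or, when
// tail-folding is preferred, by predicating the vector body.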
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Indicates that an epilogue is undesired; predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
192              "will be determined by the smallest type in loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with a estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
281 // VPlan-native vectorization path. It must be used in conjunction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function that returns the type of a loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
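/// For example (assuming a typical DataLayout), i1 is irregular: it has a
/// type size of 1 bit but an allocated size of 1 byte, so an array of VF i1
/// values occupies VF bytes while a <VF x i1> vector is packed into far fewer
/// bytes. A type such as i32, whose allocated size and store size match, is
/// regular and can be loaded/stored as a plain wide vector.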
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = FixedVectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors, or multiple
379 /// scalars. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
388 /// LoopVectorizationLegality class to provide information about the induction
389 /// and reduction variables that were found, for a given vectorization factor.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
399                       ProfileSummaryInfo *PSI)
400       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
401         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
402         Builder(PSE.getSE()->getContext()),
403         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
404         BFI(BFI), PSI(PSI) {}
405   virtual ~InnerLoopVectorizer() = default;
406 
407   /// Create a new empty loop. Unlink the old loop and connect the new one.
408   /// Return the pre-header block of the new loop.
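  ///
  /// Roughly (a sketch; the exact set of bypass checks emitted depends on the
  /// loop), the resulting control flow is:
  ///
  ///   old preheader
  ///     -> minimum-iteration, SCEV and memory runtime checks
  ///          (any failing check branches to the scalar preheader)
  ///     -> vector preheader -> vector loop body -> middle block
  ///   middle block
  ///     -> exit block, if no iterations remain
  ///     -> scalar preheader -> scalar (epilogue) loop -> exit block, otherwise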
409   BasicBlock *createVectorizedLoopSkeleton();
410 
411   /// Widen a single instruction within the innermost loop.
412   void widenInstruction(Instruction &I, VPUser &Operands,
413                         VPTransformState &State);
414 
415   /// Widen a single call instruction within the innermost loop.
416   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
417                             VPTransformState &State);
418 
419   /// Widen a single select instruction within the innermost loop.
420   void widenSelectInstruction(SelectInst &I, VPUser &Operands,
421                               bool InvariantCond, VPTransformState &State);
422 
423   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
424   void fixVectorizedLoop();
425 
426   // Return true if any runtime check is added.
427   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
428 
429   /// A type for vectorized values in the new loop. Each value from the
430   /// original loop, when vectorized, is represented by UF vector values in the
431   /// new unrolled loop, where UF is the unroll factor.
432   using VectorParts = SmallVector<Value *, 2>;
433 
434   /// Vectorize a single GetElementPtrInst based on information gathered and
435   /// decisions taken during planning.
436   void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
437                 unsigned VF, bool IsPtrLoopInvariant,
438                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
439 
440   /// Vectorize a single PHINode in a block. This method handles the induction
441   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
442   /// arbitrary length vectors.
443   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
444 
445   /// A helper function to scalarize a single Instruction in the innermost loop.
446   /// Generates a sequence of scalar instances for each lane between \p MinLane
447   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
448   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
449   /// Instr's operands.
450   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
451                             const VPIteration &Instance, bool IfPredicateInstr,
452                             VPTransformState &State);
453 
454   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
455   /// is provided, the integer induction variable will first be truncated to
456   /// the corresponding type.
457   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
458 
459   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
460   /// vector or scalar value on-demand if one is not yet available. When
461   /// vectorizing a loop, we visit the definition of an instruction before its
462   /// uses. When visiting the definition, we either vectorize or scalarize the
463   /// instruction, creating an entry for it in the corresponding map. (In some
464   /// cases, such as induction variables, we will create both vector and scalar
465   /// entries.) Then, as we encounter uses of the definition, we derive values
466   /// for each scalar or vector use unless such a value is already available.
467   /// For example, if we scalarize a definition and one of its uses is vector,
468   /// we build the required vector on-demand with an insertelement sequence
469   /// when visiting the use. Otherwise, if the use is scalar, we can use the
470   /// existing scalar definition.
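  ///
  /// For example (illustrative IR; the value names are invented), if a GEP was
  /// scalarized for UF = 1, VF = 4 and a later user needs its vector form, the
  /// four scalar GEPs are packed back together with an insertelement sequence:
  ///
  ///   %v0 = insertelement <4 x i32*> undef, i32* %gep.0, i32 0
  ///   %v1 = insertelement <4 x i32*> %v0, i32* %gep.1, i32 1
  ///   %v2 = insertelement <4 x i32*> %v1, i32* %gep.2, i32 2
  ///   %v3 = insertelement <4 x i32*> %v2, i32* %gep.3, i32 3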
471   ///
472   /// Return a value in the new loop corresponding to \p V from the original
473   /// loop at unroll index \p Part. If the value has already been vectorized,
474   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
475   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
476   /// a new vector value on-demand by inserting the scalar values into a vector
477   /// with an insertelement sequence. If the value has been neither vectorized
478   /// nor scalarized, it must be loop invariant, so we simply broadcast the
479   /// value into a vector.
480   Value *getOrCreateVectorValue(Value *V, unsigned Part);
481 
482   /// Return a value in the new loop corresponding to \p V from the original
483   /// loop at unroll and vector indices \p Instance. If the value has been
484   /// vectorized but not scalarized, the necessary extractelement instruction
485   /// will be generated.
486   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
487 
488   /// Construct the vector value of a scalarized value \p V one lane at a time.
489   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
490 
491   /// Try to vectorize interleaved access group \p Group with the base address
492   /// given in \p Addr, optionally masking the vector operations if \p
493   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
494   /// values in the vectorized loop.
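  ///
  /// For example (an illustrative sketch), an interleave group of factor 2
  /// covering the two loads in
  ///
  ///   for (i = 0; i < n; i++)
  ///     sum += A[2 * i] + A[2 * i + 1];
  ///
  /// is vectorized with a single wide load of 2 * VF consecutive elements
  /// followed by shufflevector instructions that de-interleave the even and
  /// odd elements, instead of two strided gathers.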
495   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
496                                 VPTransformState &State, VPValue *Addr,
497                                 VPValue *BlockInMask = nullptr);
498 
499   /// Vectorize Load and Store instructions with the base address given in \p
500   /// Addr, optionally masking the vector operations if \p BlockInMask is
501   /// non-null. Use \p State to translate given VPValues to IR values in the
502   /// vectorized loop.
503   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
504                                   VPValue *Addr, VPValue *StoredValue,
505                                   VPValue *BlockInMask);
506 
507   /// Set the debug location in the builder using the debug location in
508   /// the instruction.
509   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
510 
511   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
512   void fixNonInductionPHIs(void);
513 
514 protected:
515   friend class LoopVectorizationPlanner;
516 
517   /// A small list of PHINodes.
518   using PhiVector = SmallVector<PHINode *, 4>;
519 
520   /// A type for scalarized values in the new loop. Each value from the
521   /// original loop, when scalarized, is represented by UF x VF scalar values
522   /// in the new unrolled loop, where UF is the unroll factor and VF is the
523   /// vectorization factor.
524   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
525 
526   /// Set up the values of the IVs correctly when exiting the vector loop.
527   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
528                     Value *CountRoundDown, Value *EndValue,
529                     BasicBlock *MiddleBlock);
530 
531   /// Create a new induction variable inside L.
532   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
533                                    Value *Step, Instruction *DL);
534 
535   /// Handle all cross-iteration phis in the header.
536   void fixCrossIterationPHIs();
537 
538   /// Fix a first-order recurrence. This is the second phase of vectorizing
539   /// this phi node.
540   void fixFirstOrderRecurrence(PHINode *Phi);
541 
542   /// Fix a reduction cross-iteration phi. This is the second phase of
543   /// vectorizing this phi node.
544   void fixReduction(PHINode *Phi);
545 
546   /// Clear NSW/NUW flags from reduction instructions if necessary.
547   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
548 
549   /// The loop exit block may have single-value PHI nodes with some
550   /// incoming value. While vectorizing, we only handled real values
551   /// that were defined inside the loop and we should have one value for
552   /// each predecessor of its parent basic block. See PR14725.
553   void fixLCSSAPHIs();
554 
555   /// Iteratively sink the scalarized operands of a predicated instruction into
556   /// the block that was created for it.
557   void sinkScalarOperands(Instruction *PredInst);
558 
559   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
560   /// represented as.
561   void truncateToMinimalBitwidths();
562 
563   /// Create a broadcast instruction. This method generates a broadcast
564   /// instruction (shuffle) for loop invariant values and for the induction
565   /// value. If this is the induction variable then we extend it to N, N+1, ...
566   /// This is needed because each iteration in the loop corresponds to a SIMD
567   /// element.
568   virtual Value *getBroadcastInstrs(Value *V);
569 
570   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
571   /// to each vector element of Val. The sequence starts at StartIdx.
572   /// \p Opcode is relevant for FP induction variable.
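  /// For example (illustrative), for Val = <n, n, n, n>, StartIdx = 0 and
  /// Step = 1 the result is <n, n + 1, n + 2, n + 3>.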
573   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
574                                Instruction::BinaryOps Opcode =
575                                Instruction::BinaryOpsEnd);
576 
577   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
578   /// variable on which to base the steps, \p Step is the size of the step, and
579   /// \p EntryVal is the value from the original loop that maps to the steps.
580   /// Note that \p EntryVal doesn't have to be an induction variable - it
581   /// can also be a truncate instruction.
582   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
583                         const InductionDescriptor &ID);
584 
585   /// Create a vector induction phi node based on an existing scalar one. \p
586   /// EntryVal is the value from the original loop that maps to the vector phi
587   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
588   /// truncate instruction, instead of widening the original IV, we widen a
589   /// version of the IV truncated to \p EntryVal's type.
590   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
591                                        Value *Step, Instruction *EntryVal);
592 
593   /// Returns true if an instruction \p I should be scalarized instead of
594   /// vectorized for the chosen vectorization factor.
595   bool shouldScalarizeInstruction(Instruction *I) const;
596 
597   /// Returns true if we should generate a scalar version of \p IV.
598   bool needsScalarInduction(Instruction *IV) const;
599 
600   /// If there is a cast involved in the induction variable \p ID, which should
601   /// be ignored in the vectorized loop body, this function records the
602   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
603   /// cast. We had already proved that the casted Phi is equal to the uncasted
604   /// Phi in the vectorized loop (under a runtime guard), and therefore
605   /// there is no need to vectorize the cast - the same value can be used in the
606   /// vector loop for both the Phi and the cast.
607   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
608   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
609   ///
610   /// \p EntryVal is the value from the original loop that maps to the vector
611   /// phi node and is used to distinguish what is the IV currently being
612   /// processed - original one (if \p EntryVal is a phi corresponding to the
613   /// original IV) or the "newly-created" one based on the proof mentioned above
614   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
615   /// latter case \p EntryVal is a TruncInst and we must not record anything for
616   /// that IV, but it's error-prone to expect callers of this routine to care
617   /// about that, hence this explicit parameter.
618   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
619                                              const Instruction *EntryVal,
620                                              Value *VectorLoopValue,
621                                              unsigned Part,
622                                              unsigned Lane = UINT_MAX);
623 
624   /// Generate a shuffle sequence that will reverse the vector Vec.
625   virtual Value *reverseVector(Value *Vec);
626 
627   /// Returns (and creates if needed) the original loop trip count.
628   Value *getOrCreateTripCount(Loop *NewLoop);
629 
630   /// Returns (and creates if needed) the trip count of the widened loop.
631   Value *getOrCreateVectorTripCount(Loop *NewLoop);
632 
633   /// Returns a bitcasted value to the requested vector type.
634   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
635   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
636                                 const DataLayout &DL);
637 
638   /// Emit a bypass check to see if the vector trip count is zero, including if
639   /// it overflows.
640   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
641 
642   /// Emit a bypass check to see if all of the SCEV assumptions we've
643   /// had to make are correct.
644   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
645 
646   /// Emit bypass checks to check any memory assumptions we may have made.
647   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
648 
649   /// Compute the transformed value of Index at offset StartValue using step
650   /// StepValue.
651   /// For integer induction, returns StartValue + Index * StepValue.
652   /// For pointer induction, returns StartValue[Index * StepValue].
653   /// FIXME: The newly created binary instructions should contain nsw/nuw
654   /// flags, which can be found from the original scalar operations.
655   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
656                               const DataLayout &DL,
657                               const InductionDescriptor &ID) const;
658 
659   /// Add additional metadata to \p To that was not present on \p Orig.
660   ///
661   /// Currently this is used to add the noalias annotations based on the
662   /// inserted memchecks.  Use this for instructions that are *cloned* into the
663   /// vector loop.
664   void addNewMetadata(Instruction *To, const Instruction *Orig);
665 
666   /// Add metadata from one instruction to another.
667   ///
668   /// This includes both the original MDs from \p From and additional ones (\see
669   /// addNewMetadata).  Use this for *newly created* instructions in the vector
670   /// loop.
671   void addMetadata(Instruction *To, Instruction *From);
672 
673   /// Similar to the previous function but it adds the metadata to a
674   /// vector of instructions.
675   void addMetadata(ArrayRef<Value *> To, Instruction *From);
676 
677   /// The original loop.
678   Loop *OrigLoop;
679 
680   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
681   /// dynamic knowledge to simplify SCEV expressions and converts them to a
682   /// more usable form.
683   PredicatedScalarEvolution &PSE;
684 
685   /// Loop Info.
686   LoopInfo *LI;
687 
688   /// Dominator Tree.
689   DominatorTree *DT;
690 
691   /// Alias Analysis.
692   AAResults *AA;
693 
694   /// Target Library Info.
695   const TargetLibraryInfo *TLI;
696 
697   /// Target Transform Info.
698   const TargetTransformInfo *TTI;
699 
700   /// Assumption Cache.
701   AssumptionCache *AC;
702 
703   /// Interface to emit optimization remarks.
704   OptimizationRemarkEmitter *ORE;
705 
706   /// LoopVersioning.  It's only set up (non-null) if memchecks were
707   /// used.
708   ///
709   /// This is currently only used to add no-alias metadata based on the
710   /// memchecks.  The actual versioning is performed manually.
711   std::unique_ptr<LoopVersioning> LVer;
712 
713   /// The vectorization SIMD factor to use. Each vector will have this many
714   /// vector elements.
715   unsigned VF;
716 
717   /// The vectorization unroll factor to use. Each scalar is vectorized to this
718   /// many different vector instructions.
719   unsigned UF;
720 
721   /// The builder that we use
722   IRBuilder<> Builder;
723 
724   // --- Vectorization state ---
725 
726   /// The vector-loop preheader.
727   BasicBlock *LoopVectorPreHeader;
728 
729   /// The scalar-loop preheader.
730   BasicBlock *LoopScalarPreHeader;
731 
732   /// Middle Block between the vector and the scalar.
733   BasicBlock *LoopMiddleBlock;
734 
735   /// The ExitBlock of the scalar loop.
736   BasicBlock *LoopExitBlock;
737 
738   /// The vector loop body.
739   BasicBlock *LoopVectorBody;
740 
741   /// The scalar loop body.
742   BasicBlock *LoopScalarBody;
743 
744   /// A list of all bypass blocks. The first block is the entry of the loop.
745   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
746 
747   /// The new Induction variable which was added to the new block.
748   PHINode *Induction = nullptr;
749 
750   /// The induction variable of the old basic block.
751   PHINode *OldInduction = nullptr;
752 
753   /// Maps values from the original loop to their corresponding values in the
754   /// vectorized loop. A key value can map to either vector values, scalar
755   /// values or both kinds of values, depending on whether the key was
756   /// vectorized and scalarized.
757   VectorizerValueMap VectorLoopValueMap;
758 
759   /// Store instructions that were predicated.
760   SmallVector<Instruction *, 4> PredicatedInstructions;
761 
762   /// Trip count of the original loop.
763   Value *TripCount = nullptr;
764 
765   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
766   Value *VectorTripCount = nullptr;
767 
768   /// The legality analysis.
769   LoopVectorizationLegality *Legal;
770 
771   /// The profitability analysis.
772   LoopVectorizationCostModel *Cost;
773 
774   // Record whether runtime checks are added.
775   bool AddedSafetyChecks = false;
776 
777   // Holds the end values for each induction variable. We save the end values
778   // so we can later fix-up the external users of the induction variables.
779   DenseMap<PHINode *, Value *> IVEndValues;
780 
781   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
782   // fixed up at the end of vector code generation.
783   SmallVector<PHINode *, 8> OrigPHIsToFix;
784 
785   /// BFI and PSI are used to check for profile guided size optimizations.
786   BlockFrequencyInfo *BFI;
787   ProfileSummaryInfo *PSI;
788 };
789 
790 class InnerLoopUnroller : public InnerLoopVectorizer {
791 public:
792   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
793                     LoopInfo *LI, DominatorTree *DT,
794                     const TargetLibraryInfo *TLI,
795                     const TargetTransformInfo *TTI, AssumptionCache *AC,
796                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
797                     LoopVectorizationLegality *LVL,
798                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
799                     ProfileSummaryInfo *PSI)
800       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
801                             UnrollFactor, LVL, CM, BFI, PSI) {}
802 
803 private:
804   Value *getBroadcastInstrs(Value *V) override;
805   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
806                        Instruction::BinaryOps Opcode =
807                        Instruction::BinaryOpsEnd) override;
808   Value *reverseVector(Value *Vec) override;
809 };
810 
811 } // end namespace llvm
812 
813 /// Look for a meaningful debug location on the instruction or its
814 /// operands.
815 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
816   if (!I)
817     return I;
818 
819   DebugLoc Empty;
820   if (I->getDebugLoc() != Empty)
821     return I;
822 
823   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
824     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
825       if (OpInst->getDebugLoc() != Empty)
826         return OpInst;
827   }
828 
829   return I;
830 }
831 
832 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
833   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
834     const DILocation *DIL = Inst->getDebugLoc();
835     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
836         !isa<DbgInfoIntrinsic>(Inst)) {
837       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
838       if (NewDIL)
839         B.SetCurrentDebugLocation(NewDIL.getValue());
840       else
841         LLVM_DEBUG(dbgs()
842                    << "Failed to create new discriminator: "
843                    << DIL->getFilename() << " Line: " << DIL->getLine());
844     }
845     else
846       B.SetCurrentDebugLocation(DIL);
847   } else
848     B.SetCurrentDebugLocation(DebugLoc());
849 }
850 
851 /// Write a record \p DebugMsg about vectorization failure to the debug
852 /// output stream. If \p I is passed, it is an instruction that prevents
853 /// vectorization.
854 #ifndef NDEBUG
855 static void debugVectorizationFailure(const StringRef DebugMsg,
856     Instruction *I) {
857   dbgs() << "LV: Not vectorizing: " << DebugMsg;
858   if (I != nullptr)
859     dbgs() << " " << *I;
860   else
861     dbgs() << '.';
862   dbgs() << '\n';
863 }
864 #endif
865 
866 /// Create an analysis remark that explains why vectorization failed
867 ///
868 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
869 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
870 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
871 /// the location of the remark.  \return the remark object that can be
872 /// streamed to.
873 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
874     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
875   Value *CodeRegion = TheLoop->getHeader();
876   DebugLoc DL = TheLoop->getStartLoc();
877 
878   if (I) {
879     CodeRegion = I->getParent();
880     // If there is no debug location attached to the instruction, fall back to
881     // using the loop's.
882     if (I->getDebugLoc())
883       DL = I->getDebugLoc();
884   }
885 
886   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
887   R << "loop not vectorized: ";
888   return R;
889 }
890 
891 namespace llvm {
892 
893 void reportVectorizationFailure(const StringRef DebugMsg,
894     const StringRef OREMsg, const StringRef ORETag,
895     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
896   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
897   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
898   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
899                 ORETag, TheLoop, I) << OREMsg);
900 }
901 
902 } // end namespace llvm
903 
904 #ifndef NDEBUG
905 /// \return string containing a file name and a line # for the given loop.
906 static std::string getDebugLocString(const Loop *L) {
907   std::string Result;
908   if (L) {
909     raw_string_ostream OS(Result);
910     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
911       LoopDbgLoc.print(OS);
912     else
913       // Just print the module name.
914       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
915     OS.flush();
916   }
917   return Result;
918 }
919 #endif
920 
921 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
922                                          const Instruction *Orig) {
923   // If the loop was versioned with memchecks, add the corresponding no-alias
924   // metadata.
925   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
926     LVer->annotateInstWithNoAlias(To, Orig);
927 }
928 
929 void InnerLoopVectorizer::addMetadata(Instruction *To,
930                                       Instruction *From) {
931   propagateMetadata(To, From);
932   addNewMetadata(To, From);
933 }
934 
935 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
936                                       Instruction *From) {
937   for (Value *V : To) {
938     if (Instruction *I = dyn_cast<Instruction>(V))
939       addMetadata(I, From);
940   }
941 }
942 
943 namespace llvm {
944 
945 // Loop-vectorization cost-model hints that describe how the scalar epilogue
946 // loop should be lowered.
947 enum ScalarEpilogueLowering {
948 
949   // The default: allowing scalar epilogues.
950   CM_ScalarEpilogueAllowed,
951 
952   // Vectorization with OptForSize: don't allow epilogues.
953   CM_ScalarEpilogueNotAllowedOptSize,
954 
955   // A special case of vectorization with OptForSize: loops with a very small
956   // trip count are considered for vectorization under OptForSize, thereby
957   // making sure the cost of their loop body is dominant, free of runtime
958   // guards and scalar iteration overheads.
959   CM_ScalarEpilogueNotAllowedLowTripLoop,
960 
961   // Loop hint predicate indicating an epilogue is undesired.
962   CM_ScalarEpilogueNotNeededUsePredicate
963 };
964 
965 /// LoopVectorizationCostModel - estimates the expected speedups due to
966 /// vectorization.
967 /// In many cases vectorization is not profitable. This can happen because of
968 /// a number of reasons. In this class we mainly attempt to predict the
969 /// expected speedup/slowdowns due to the supported instruction set. We use the
970 /// TargetTransformInfo to query the different backends for the cost of
971 /// different operations.
972 class LoopVectorizationCostModel {
973 public:
974   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
975                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
976                              LoopVectorizationLegality *Legal,
977                              const TargetTransformInfo &TTI,
978                              const TargetLibraryInfo *TLI, DemandedBits *DB,
979                              AssumptionCache *AC,
980                              OptimizationRemarkEmitter *ORE, const Function *F,
981                              const LoopVectorizeHints *Hints,
982                              InterleavedAccessInfo &IAI)
983       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
984         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
985         Hints(Hints), InterleaveInfo(IAI) {}
986 
987   /// \return An upper bound for the vectorization factor, or None if
988   /// vectorization and interleaving should be avoided up front.
989   Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
990 
991   /// \return True if runtime checks are required for vectorization, and false
992   /// otherwise.
993   bool runtimeChecksRequired();
994 
995   /// \return The most profitable vectorization factor and the cost of that VF.
996   /// This method checks every power of two up to MaxVF. If UserVF is not zero,
997   /// then this vectorization factor will be selected if vectorization is
998   /// possible.
999   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1000 
1001   /// Setup cost-based decisions for user vectorization factor.
1002   void selectUserVectorizationFactor(unsigned UserVF) {
1003     collectUniformsAndScalars(UserVF);
1004     collectInstsToScalarize(UserVF);
1005   }
1006 
1007   /// \return The size (in bits) of the smallest and widest types in the code
1008   /// that needs to be vectorized. We ignore values that remain scalar such as
1009   /// 64 bit loop indices.
1010   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1011 
1012   /// \return The desired interleave count.
1013   /// If interleave count has been specified by metadata it will be returned.
1014   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1015   /// are the selected vectorization factor and the cost of the selected VF.
1016   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1017 
1018   /// A memory access instruction may be vectorized in more than one way.
1019   /// The form of the instruction after vectorization depends on its cost.
1020   /// This function makes cost-based decisions for Load/Store instructions
1021   /// and collects them in a map. This decision map is used for building
1022   /// the lists of loop-uniform and loop-scalar instructions.
1023   /// The calculated cost is saved with the widening decision in order to
1024   /// avoid redundant calculations.
1025   void setCostBasedWideningDecision(unsigned VF);
1026 
1027   /// A struct that represents some properties of the register usage
1028   /// of a loop.
1029   struct RegisterUsage {
1030     /// Holds the number of loop invariant values that are used in the loop.
1031     /// The key is ClassID of target-provided register class.
1032     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1033     /// Holds the maximum number of concurrent live intervals in the loop.
1034     /// The key is ClassID of target-provided register class.
1035     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1036   };
1037 
1038   /// \return Information about the register usage of the loop for the
1039   /// given vectorization factors.
1040   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1041 
1042   /// Collect values we want to ignore in the cost model.
1043   void collectValuesToIgnore();
1044 
1045   /// \returns The smallest bitwidth each instruction can be represented with.
1046   /// The vector equivalents of these instructions should be truncated to this
1047   /// type.
1048   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1049     return MinBWs;
1050   }
1051 
1052   /// \returns True if it is more profitable to scalarize instruction \p I for
1053   /// vectorization factor \p VF.
1054   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1055     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1056 
1057     // Cost model is not run in the VPlan-native path - return conservative
1058     // result until this changes.
1059     if (EnableVPlanNativePath)
1060       return false;
1061 
1062     auto Scalars = InstsToScalarize.find(VF);
1063     assert(Scalars != InstsToScalarize.end() &&
1064            "VF not yet analyzed for scalarization profitability");
1065     return Scalars->second.find(I) != Scalars->second.end();
1066   }
1067 
1068   /// Returns true if \p I is known to be uniform after vectorization.
1069   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1070     if (VF == 1)
1071       return true;
1072 
1073     // Cost model is not run in the VPlan-native path - return conservative
1074     // result until this changes.
1075     if (EnableVPlanNativePath)
1076       return false;
1077 
1078     auto UniformsPerVF = Uniforms.find(VF);
1079     assert(UniformsPerVF != Uniforms.end() &&
1080            "VF not yet analyzed for uniformity");
1081     return UniformsPerVF->second.count(I);
1082   }
1083 
1084   /// Returns true if \p I is known to be scalar after vectorization.
1085   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1086     if (VF == 1)
1087       return true;
1088 
1089     // Cost model is not run in the VPlan-native path - return conservative
1090     // result until this changes.
1091     if (EnableVPlanNativePath)
1092       return false;
1093 
1094     auto ScalarsPerVF = Scalars.find(VF);
1095     assert(ScalarsPerVF != Scalars.end() &&
1096            "Scalar values are not calculated for VF");
1097     return ScalarsPerVF->second.count(I);
1098   }
1099 
1100   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1101   /// for vectorization factor \p VF.
1102   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1103     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1104            !isProfitableToScalarize(I, VF) &&
1105            !isScalarAfterVectorization(I, VF);
1106   }
1107 
1108   /// Decision that was taken during cost calculation for a memory instruction.
1109   enum InstWidening {
1110     CM_Unknown,
1111     CM_Widen,         // For consecutive accesses with stride +1.
1112     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1113     CM_Interleave,
1114     CM_GatherScatter,
1115     CM_Scalarize
1116   };
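
  /// As an illustration (not an exhaustive list): a load of A[i] with stride
  /// +1 is typically CM_Widen (one wide load); A[-i] is CM_Widen_Reverse (a
  /// wide load plus a reverse shuffle); a strided access such as A[3*i] may
  /// end up as CM_Interleave (wide load plus shuffles, if it belongs to a
  /// group), CM_GatherScatter (a masked gather/scatter), or CM_Scalarize,
  /// whichever the cost model finds cheapest.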
1117 
1118   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1119   /// instruction \p I and vector width \p VF.
1120   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1121                            unsigned Cost) {
1122     assert(VF >= 2 && "Expected VF >=2");
1123     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1124   }
1125 
1126   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1127   /// interleaving group \p Grp and vector width \p VF.
1128   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1129                            InstWidening W, unsigned Cost) {
1130     assert(VF >= 2 && "Expected VF >=2");
1131     /// Broadcast this decision to all instructions inside the group.
1132     /// But the cost will be assigned to one instruction only.
1133     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1134       if (auto *I = Grp->getMember(i)) {
1135         if (Grp->getInsertPos() == I)
1136           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1137         else
1138           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1139       }
1140     }
1141   }
1142 
1143   /// Return the cost model decision for the given instruction \p I and vector
1144   /// width \p VF. Return CM_Unknown if this instruction did not pass
1145   /// through the cost modeling.
1146   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1147     assert(VF >= 2 && "Expected VF >=2");
1148 
1149     // Cost model is not run in the VPlan-native path - return conservative
1150     // result until this changes.
1151     if (EnableVPlanNativePath)
1152       return CM_GatherScatter;
1153 
1154     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1155     auto Itr = WideningDecisions.find(InstOnVF);
1156     if (Itr == WideningDecisions.end())
1157       return CM_Unknown;
1158     return Itr->second.first;
1159   }
1160 
1161   /// Return the vectorization cost for the given instruction \p I and vector
1162   /// width \p VF.
1163   unsigned getWideningCost(Instruction *I, unsigned VF) {
1164     assert(VF >= 2 && "Expected VF >=2");
1165     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1166     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1167            "The cost is not calculated");
1168     return WideningDecisions[InstOnVF].second;
1169   }
1170 
1171   /// Return True if instruction \p I is an optimizable truncate whose operand
1172   /// is an induction variable. Such a truncate will be removed by adding a new
1173   /// induction variable with the destination type.
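  ///
  /// For example (illustrative IR): for an i64 primary induction %iv, the
  /// truncate
  ///   %t = trunc i64 %iv to i32
  /// can be replaced by a new i32 induction variable producing %t directly,
  /// provided the truncate is not already free for the target.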
1174   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1175     // If the instruction is not a truncate, return false.
1176     auto *Trunc = dyn_cast<TruncInst>(I);
1177     if (!Trunc)
1178       return false;
1179 
1180     // Get the source and destination types of the truncate.
1181     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1182     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1183 
1184     // If the truncate is free for the given types, return false. Replacing a
1185     // free truncate with an induction variable would add an induction variable
1186     // update instruction to each iteration of the loop. We exclude from this
1187     // check the primary induction variable since it will need an update
1188     // instruction regardless.
1189     Value *Op = Trunc->getOperand(0);
1190     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1191       return false;
1192 
1193     // If the truncated value is not an induction variable, return false.
1194     return Legal->isInductionPhi(Op);
1195   }
1196 
1197   /// Collects the instructions to scalarize for each predicated instruction in
1198   /// the loop.
1199   void collectInstsToScalarize(unsigned VF);
1200 
1201   /// Collect Uniform and Scalar values for the given \p VF.
1202   /// The sets depend on CM decision for Load/Store instructions
1203   /// that may be vectorized as interleave, gather-scatter or scalarized.
1204   void collectUniformsAndScalars(unsigned VF) {
1205     // Do the analysis once.
1206     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1207       return;
1208     setCostBasedWideningDecision(VF);
1209     collectLoopUniforms(VF);
1210     collectLoopScalars(VF);
1211   }
1212 
1213   /// Returns true if the target machine supports masked store operation
1214   /// for the given \p DataType and kind of access to \p Ptr.
1215   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1216     return Legal->isConsecutivePtr(Ptr) &&
1217            TTI.isLegalMaskedStore(DataType, Alignment);
1218   }
1219 
1220   /// Returns true if the target machine supports masked load operation
1221   /// for the given \p DataType and kind of access to \p Ptr.
1222   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1223     return Legal->isConsecutivePtr(Ptr) &&
1224            TTI.isLegalMaskedLoad(DataType, Alignment);
1225   }
1226 
1227   /// Returns true if the target machine supports masked scatter operation
1228   /// for the given \p DataType.
1229   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1230     return TTI.isLegalMaskedScatter(DataType, Alignment);
1231   }
1232 
1233   /// Returns true if the target machine supports masked gather operation
1234   /// for the given \p DataType.
1235   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1236     return TTI.isLegalMaskedGather(DataType, Alignment);
1237   }
1238 
1239   /// Returns true if the target machine can represent \p V as a masked gather
1240   /// or scatter operation.
1241   bool isLegalGatherOrScatter(Value *V) {
1242     bool LI = isa<LoadInst>(V);
1243     bool SI = isa<StoreInst>(V);
1244     if (!LI && !SI)
1245       return false;
1246     auto *Ty = getMemInstValueType(V);
1247     Align Align = getLoadStoreAlignment(V);
1248     return (LI && isLegalMaskedGather(Ty, Align)) ||
1249            (SI && isLegalMaskedScatter(Ty, Align));
1250   }
1251 
1252   /// Returns true if \p I is an instruction that will be scalarized with
1253   /// predication. Such instructions include conditional stores and
1254   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
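  /// For example (illustrative only), a udiv whose divisor may be zero and
  /// that executes only under a condition must be scalarized and predicated,
  /// since speculating it as a wide vector operation could trap.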
1257   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1258 
1259   // Returns true if \p I is an instruction that will be predicated either
1260   // through scalar predication or masked load/store or masked gather/scatter.
1261   // Superset of instructions that return true for isScalarWithPredication.
1262   bool isPredicatedInst(Instruction *I) {
1263     if (!blockNeedsPredication(I->getParent()))
1264       return false;
1265     // Loads and stores that need some form of masked operation are predicated
1266     // instructions.
1267     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1268       return Legal->isMaskRequired(I);
1269     return isScalarWithPredication(I);
1270   }
1271 
1272   /// Returns true if \p I is a memory instruction with consecutive memory
1273   /// access that can be widened.
1274   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1275 
1276   /// Returns true if \p I is a memory instruction in an interleaved-group
1277   /// of memory accesses that can be vectorized with wide vector loads/stores
1278   /// and shuffles.
1279   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1280 
1281   /// Check if \p Instr belongs to any interleaved access group.
1282   bool isAccessInterleaved(Instruction *Instr) {
1283     return InterleaveInfo.isInterleaved(Instr);
1284   }
1285 
1286   /// Get the interleaved access group that \p Instr belongs to.
1287   const InterleaveGroup<Instruction> *
1288   getInterleavedAccessGroup(Instruction *Instr) {
1289     return InterleaveInfo.getInterleaveGroup(Instr);
1290   }
1291 
1292   /// Returns true if an interleaved group requires a scalar iteration
1293   /// to handle accesses with gaps, and there is nothing preventing us from
1294   /// creating a scalar epilogue.
1295   bool requiresScalarEpilogue() const {
1296     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1297   }
1298 
  /// Returns true if a scalar epilogue is allowed; one may be disallowed due
  /// to optsize or a loop hint annotation.
1301   bool isScalarEpilogueAllowed() const {
1302     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1303   }
1304 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1306   bool foldTailByMasking() const { return FoldTailByMasking; }
1307 
1308   bool blockNeedsPredication(BasicBlock *BB) {
1309     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1310   }
1311 
1312   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1313   /// with factor VF.  Return the cost of the instruction, including
1314   /// scalarization overhead if it's needed.
1315   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1316 
1317   /// Estimate cost of a call instruction CI if it were vectorized with factor
1318   /// VF. Return the cost of the instruction, including scalarization overhead
1319   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e., either a vector version isn't available or it is too
  /// expensive.
1322   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1323 
1324   /// Invalidates decisions already taken by the cost model.
1325   void invalidateCostModelingDecisions() {
1326     WideningDecisions.clear();
1327     Uniforms.clear();
1328     Scalars.clear();
1329   }
1330 
1331 private:
1332   unsigned NumPredStores = 0;
1333 
1334   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1335   /// than zero. One is returned if vectorization should best be avoided due
1336   /// to cost.
1337   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1338 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1346   using VectorizationCostTy = std::pair<unsigned, bool>;
1347 
1348   /// Returns the expected execution cost. The unit of the cost does
1349   /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
1352   VectorizationCostTy expectedCost(unsigned VF);
1353 
1354   /// Returns the execution time cost of an instruction for a given vector
1355   /// width. Vector width of one means scalar.
1356   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1357 
1358   /// The cost-computation logic from getInstructionCost which provides
1359   /// the vector type as an output parameter.
1360   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1361 
1362   /// Calculate vectorization cost of memory instruction \p I.
1363   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1364 
1365   /// The cost computation for scalarized memory instruction.
1366   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1367 
1368   /// The cost computation for interleaving group of memory instructions.
1369   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1370 
1371   /// The cost computation for Gather/Scatter instruction.
1372   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1373 
1374   /// The cost computation for widening instruction \p I with consecutive
1375   /// memory access.
1376   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1377 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of the
  /// last element).
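  /// For example (illustrative only), a load from a loop-invariant address,
  /// such as "x = *p" with p unchanged in the loop, is costed as one scalar
  /// load plus a broadcast of the loaded value into a vector.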
1382   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1383 
1384   /// Estimate the overhead of scalarizing an instruction. This is a
1385   /// convenience wrapper for the type-based getScalarizationOverhead API.
1386   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1387 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1390   bool isConsecutiveLoadOrStore(Instruction *I);
1391 
1392   /// Returns true if an artificially high cost for emulated masked memrefs
1393   /// should be used.
1394   bool useEmulatedMaskMemRefHack(Instruction *I);
1395 
1396   /// Map of scalar integer values to the smallest bitwidth they can be legally
1397   /// represented as. The vector equivalents of these values should be truncated
1398   /// to this type.
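  /// For example (illustrative only), an i32 value known to carry only 8
  /// significant bits maps to 8 here, so its vector form can be narrowed from
  /// <VF x i32> to <VF x i8>.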
1399   MapVector<Instruction *, uint64_t> MinBWs;
1400 
1401   /// A type representing the costs for instructions if they were to be
1402   /// scalarized rather than vectorized. The entries are Instruction-Cost
1403   /// pairs.
1404   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1405 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1408   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1409 
1410   /// Records whether it is allowed to have the original scalar loop execute at
1411   /// least once. This may be needed as a fallback loop in case runtime
1412   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not divisible by the VF,
1414   /// or as a peel-loop to handle gaps in interleave-groups.
1415   /// Under optsize and when the trip count is very small we don't allow any
1416   /// iterations to execute in the scalar loop.
1417   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1418 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1420   bool FoldTailByMasking = false;
1421 
1422   /// A map holding scalar costs for different vectorization factors. The
1423   /// presence of a cost for an instruction in the mapping indicates that the
1424   /// instruction will be scalarized when vectorizing with the associated
1425   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1426   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1427 
1428   /// Holds the instructions known to be uniform after vectorization.
1429   /// The data is collected per VF.
1430   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1431 
1432   /// Holds the instructions known to be scalar after vectorization.
1433   /// The data is collected per VF.
1434   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1435 
1436   /// Holds the instructions (address computations) that are forced to be
1437   /// scalarized.
1438   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1439 
1440   /// Returns the expected difference in cost from scalarizing the expression
1441   /// feeding a predicated instruction \p PredInst. The instructions to
1442   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1443   /// non-negative return value implies the expression will be scalarized.
1444   /// Currently, only single-use chains are considered for scalarization.
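  /// For example (illustrative only), if a predicated store is scalarized, an
  /// add feeding only that store may be cheaper as VF scalar adds than as a
  /// vector add plus VF extractelements; the discount captures that saving.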
1445   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1446                               unsigned VF);
1447 
1448   /// Collect the instructions that are uniform after vectorization. An
1449   /// instruction is uniform if we represent it with a single scalar value in
1450   /// the vectorized loop corresponding to each vector iteration. Examples of
1451   /// uniform instructions include pointer operands of consecutive or
1452   /// interleaved memory accesses. Note that although uniformity implies an
1453   /// instruction will be scalar, the reverse is not true. In general, a
1454   /// scalarized instruction will be represented by VF scalar values in the
1455   /// vectorized loop, each corresponding to an iteration of the original
1456   /// scalar loop.
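  /// For example (illustrative only), the GEP computing the address of a
  /// consecutive load remains uniform: a single scalar GEP per vector
  /// iteration feeds the wide load instead of VF scalar GEPs.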
1457   void collectLoopUniforms(unsigned VF);
1458 
1459   /// Collect the instructions that are scalar after vectorization. An
1460   /// instruction is scalar if it is known to be uniform or will be scalarized
1461   /// during vectorization. Non-uniform scalarized instructions will be
1462   /// represented by VF values in the vectorized loop, each corresponding to an
1463   /// iteration of the original scalar loop.
1464   void collectLoopScalars(unsigned VF);
1465 
1466   /// Keeps cost model vectorization decision and cost for instructions.
1467   /// Right now it is used for memory instructions only.
1468   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1469                                 std::pair<InstWidening, unsigned>>;
1470 
1471   DecisionList WideningDecisions;
1472 
1473   /// Returns true if \p V is expected to be vectorized and it needs to be
1474   /// extracted.
1475   bool needsExtract(Value *V, unsigned VF) const {
1476     Instruction *I = dyn_cast<Instruction>(V);
1477     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1478       return false;
1479 
1480     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1482     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1483     // the scalars are collected. That should be a safe assumption in most
1484     // cases, because we check if the operands have vectorizable types
1485     // beforehand in LoopVectorizationLegality.
1486     return Scalars.find(VF) == Scalars.end() ||
1487            !isScalarAfterVectorization(I, VF);
1488   };
1489 
1490   /// Returns a range containing only operands needing to be extracted.
1491   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1492                                                    unsigned VF) {
1493     return SmallVector<Value *, 4>(make_filter_range(
1494         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1495   }
1496 
1497 public:
1498   /// The loop that we evaluate.
1499   Loop *TheLoop;
1500 
1501   /// Predicated scalar evolution analysis.
1502   PredicatedScalarEvolution &PSE;
1503 
1504   /// Loop Info analysis.
1505   LoopInfo *LI;
1506 
1507   /// Vectorization legality.
1508   LoopVectorizationLegality *Legal;
1509 
1510   /// Vector target information.
1511   const TargetTransformInfo &TTI;
1512 
1513   /// Target Library Info.
1514   const TargetLibraryInfo *TLI;
1515 
1516   /// Demanded bits analysis.
1517   DemandedBits *DB;
1518 
1519   /// Assumption cache.
1520   AssumptionCache *AC;
1521 
1522   /// Interface to emit optimization remarks.
1523   OptimizationRemarkEmitter *ORE;
1524 
1525   const Function *TheFunction;
1526 
1527   /// Loop Vectorize Hint.
1528   const LoopVectorizeHints *Hints;
1529 
  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1532   InterleavedAccessInfo &InterleaveInfo;
1533 
1534   /// Values to ignore in the cost model.
1535   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1536 
1537   /// Values to ignore in the cost model when VF > 1.
1538   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1539 };
1540 
1541 } // end namespace llvm
1542 
1543 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1544 // vectorization. The loop needs to be annotated with #pragma omp simd
1545 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1546 // vector length information is not provided, vectorization is not considered
1547 // explicit. Interleave hints are not allowed either. These limitations will be
1548 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1550 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1551 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1552 // provides *explicit vectorization hints* (LV can bypass legal checks and
1553 // assume that vectorization is legal). However, both hints are implemented
1554 // using the same metadata (llvm.loop.vectorize, processed by
1555 // LoopVectorizeHints). This will be fixed in the future when the native IR
1556 // representation for pragma 'omp simd' is introduced.
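// For example (illustrative only), an outer loop such as the following
// qualifies:
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)        // explicitly annotated outer loop
//     for (int j = 0; j < M; ++j)
//       A[i][j] = B[i][j] + C[i];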
1557 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1558                                    OptimizationRemarkEmitter *ORE) {
1559   assert(!OuterLp->empty() && "This is not an outer loop");
1560   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1561 
1562   // Only outer loops with an explicit vectorization hint are supported.
1563   // Unannotated outer loops are ignored.
1564   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1565     return false;
1566 
1567   Function *Fn = OuterLp->getHeader()->getParent();
1568   if (!Hints.allowVectorization(Fn, OuterLp,
1569                                 true /*VectorizeOnlyWhenForced*/)) {
1570     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1571     return false;
1572   }
1573 
1574   if (Hints.getInterleave() > 1) {
1575     // TODO: Interleave support is future work.
1576     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1577                          "outer loops.\n");
1578     Hints.emitRemarkWithHints();
1579     return false;
1580   }
1581 
1582   return true;
1583 }
1584 
1585 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1586                                   OptimizationRemarkEmitter *ORE,
1587                                   SmallVectorImpl<Loop *> &V) {
1588   // Collect inner loops and outer loops without irreducible control flow. For
1589   // now, only collect outer loops that have explicit vectorization hints. If we
1590   // are stress testing the VPlan H-CFG construction, we collect the outermost
1591   // loop of every loop nest.
1592   if (L.empty() || VPlanBuildStressTest ||
1593       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1594     LoopBlocksRPO RPOT(&L);
1595     RPOT.perform(LI);
1596     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1597       V.push_back(&L);
1598       // TODO: Collect inner loops inside marked outer loops in case
1599       // vectorization fails for the outer loop. Do not invoke
1600       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1601       // already known to be reducible. We can use an inherited attribute for
1602       // that.
1603       return;
1604     }
1605   }
1606   for (Loop *InnerL : L)
1607     collectSupportedLoops(*InnerL, LI, ORE, V);
1608 }
1609 
1610 namespace {
1611 
1612 /// The LoopVectorize Pass.
1613 struct LoopVectorize : public FunctionPass {
1614   /// Pass identification, replacement for typeid
1615   static char ID;
1616 
1617   LoopVectorizePass Impl;
1618 
1619   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1620                          bool VectorizeOnlyWhenForced = false)
1621       : FunctionPass(ID),
1622         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1623     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1624   }
1625 
1626   bool runOnFunction(Function &F) override {
1627     if (skipFunction(F))
1628       return false;
1629 
1630     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1631     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1632     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1633     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1634     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1635     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1636     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1637     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1638     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1639     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1640     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1641     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1642     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1643 
1644     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1645         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1646 
1647     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1648                         GetLAA, *ORE, PSI).MadeAnyChange;
1649   }
1650 
1651   void getAnalysisUsage(AnalysisUsage &AU) const override {
1652     AU.addRequired<AssumptionCacheTracker>();
1653     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1654     AU.addRequired<DominatorTreeWrapperPass>();
1655     AU.addRequired<LoopInfoWrapperPass>();
1656     AU.addRequired<ScalarEvolutionWrapperPass>();
1657     AU.addRequired<TargetTransformInfoWrapperPass>();
1658     AU.addRequired<AAResultsWrapperPass>();
1659     AU.addRequired<LoopAccessLegacyAnalysis>();
1660     AU.addRequired<DemandedBitsWrapperPass>();
1661     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1662     AU.addRequired<InjectTLIMappingsLegacy>();
1663 
1664     // We currently do not preserve loopinfo/dominator analyses with outer loop
1665     // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
1667     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1668     if (!EnableVPlanNativePath) {
1669       AU.addPreserved<LoopInfoWrapperPass>();
1670       AU.addPreserved<DominatorTreeWrapperPass>();
1671     }
1672 
1673     AU.addPreserved<BasicAAWrapperPass>();
1674     AU.addPreserved<GlobalsAAWrapperPass>();
1675     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1676   }
1677 };
1678 
1679 } // end anonymous namespace
1680 
1681 //===----------------------------------------------------------------------===//
1682 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1683 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1684 //===----------------------------------------------------------------------===//
1685 
1686 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1690   Instruction *Instr = dyn_cast<Instruction>(V);
1691   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1692                      (!Instr ||
1693                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1694   // Place the code for broadcasting invariant variables in the new preheader.
1695   IRBuilder<>::InsertPointGuard Guard(Builder);
1696   if (SafeToHoist)
1697     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1698 
1699   // Broadcast the scalar into all locations in the vector.
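  // For example (illustrative IR only), for VF = 4 and an i32 scalar %v this
  // typically produces:
  //   %splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef,
  //                          <4 x i32> zeroinitializer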
1700   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1701 
1702   return Shuf;
1703 }
1704 
1705 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1706     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1707   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1708          "Expected either an induction phi-node or a truncate of it!");
1709   Value *Start = II.getStartValue();
1710 
1711   // Construct the initial value of the vector IV in the vector loop preheader
1712   auto CurrIP = Builder.saveIP();
1713   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1714   if (isa<TruncInst>(EntryVal)) {
1715     assert(Start->getType()->isIntegerTy() &&
1716            "Truncation requires an integer type");
1717     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1718     Step = Builder.CreateTrunc(Step, TruncType);
1719     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1720   }
1721   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1722   Value *SteppedStart =
1723       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1724 
1725   // We create vector phi nodes for both integer and floating-point induction
1726   // variables. Here, we determine the kind of arithmetic we will perform.
1727   Instruction::BinaryOps AddOp;
1728   Instruction::BinaryOps MulOp;
1729   if (Step->getType()->isIntegerTy()) {
1730     AddOp = Instruction::Add;
1731     MulOp = Instruction::Mul;
1732   } else {
1733     AddOp = II.getInductionOpcode();
1734     MulOp = Instruction::FMul;
1735   }
1736 
1737   // Multiply the vectorization factor by the step using integer or
1738   // floating-point arithmetic as appropriate.
1739   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1740   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1741 
1742   // Create a vector splat to use in the induction update.
1743   //
1744   // FIXME: If the step is non-constant, we create the vector splat with
1745   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1746   //        handle a constant vector splat.
1747   Value *SplatVF =
1748       isa<Constant>(Mul)
1749           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1750           : Builder.CreateVectorSplat(VF, Mul);
1751   Builder.restoreIP(CurrIP);
1752 
1753   // We may need to add the step a number of times, depending on the unroll
1754   // factor. The last of those goes into the PHI.
1755   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1756                                     &*LoopVectorBody->getFirstInsertionPt());
1757   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1758   Instruction *LastInduction = VecInd;
1759   for (unsigned Part = 0; Part < UF; ++Part) {
1760     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1761 
1762     if (isa<TruncInst>(EntryVal))
1763       addMetadata(LastInduction, EntryVal);
1764     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1765 
1766     LastInduction = cast<Instruction>(addFastMathFlag(
1767         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1768     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1769   }
1770 
1771   // Move the last step to the end of the latch block. This ensures consistent
1772   // placement of all induction updates.
1773   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1774   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1775   auto *ICmp = cast<Instruction>(Br->getCondition());
1776   LastInduction->moveBefore(ICmp);
1777   LastInduction->setName("vec.ind.next");
1778 
1779   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1780   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1781 }
1782 
1783 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1784   return Cost->isScalarAfterVectorization(I, VF) ||
1785          Cost->isProfitableToScalarize(I, VF);
1786 }
1787 
1788 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1789   if (shouldScalarizeInstruction(IV))
1790     return true;
1791   auto isScalarInst = [&](User *U) -> bool {
1792     auto *I = cast<Instruction>(U);
1793     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1794   };
1795   return llvm::any_of(IV->users(), isScalarInst);
1796 }
1797 
1798 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1799     const InductionDescriptor &ID, const Instruction *EntryVal,
1800     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1801   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1802          "Expected either an induction phi-node or a truncate of it!");
1803 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't have
  // to do any recording in this case - that is done when the original IV is
  // processed.
1810   if (isa<TruncInst>(EntryVal))
1811     return;
1812 
1813   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1814   if (Casts.empty())
1815     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any exist) have no uses outside the
  // induction update chain itself.
1819   Instruction *CastInst = *Casts.begin();
1820   if (Lane < UINT_MAX)
1821     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1822   else
1823     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1824 }
1825 
1826 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1827   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1828          "Primary induction variable must have an integer type");
1829 
1830   auto II = Legal->getInductionVars().find(IV);
1831   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1832 
1833   auto ID = II->second;
1834   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1835 
1836   // The value from the original loop to which we are mapping the new induction
1837   // variable.
1838   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1839 
1840   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1841 
1842   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1844   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1845     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1846            "Induction step should be loop invariant");
1847     if (PSE.getSE()->isSCEVable(IV->getType())) {
1848       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1849       return Exp.expandCodeFor(Step, Step->getType(),
1850                                LoopVectorPreHeader->getTerminator());
1851     }
1852     return cast<SCEVUnknown>(Step)->getValue();
1853   };
1854 
1855   // The scalar value to broadcast. This is derived from the canonical
1856   // induction variable. If a truncation type is given, truncate the canonical
1857   // induction variable and step. Otherwise, derive these values from the
1858   // induction descriptor.
1859   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1860     Value *ScalarIV = Induction;
1861     if (IV != OldInduction) {
1862       ScalarIV = IV->getType()->isIntegerTy()
1863                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1864                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1865                                           IV->getType());
1866       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1867       ScalarIV->setName("offset.idx");
1868     }
1869     if (Trunc) {
1870       auto *TruncType = cast<IntegerType>(Trunc->getType());
1871       assert(Step->getType()->isIntegerTy() &&
1872              "Truncation requires an integer step");
1873       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1874       Step = Builder.CreateTrunc(Step, TruncType);
1875     }
1876     return ScalarIV;
1877   };
1878 
1879   // Create the vector values from the scalar IV, in the absence of creating a
1880   // vector IV.
1881   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1882     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1883     for (unsigned Part = 0; Part < UF; ++Part) {
1884       Value *EntryPart =
1885           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1886       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1887       if (Trunc)
1888         addMetadata(EntryPart, Trunc);
1889       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1890     }
1891   };
1892 
1893   // Now do the actual transformations, and start with creating the step value.
1894   Value *Step = CreateStepValue(ID.getStep());
1895   if (VF <= 1) {
1896     Value *ScalarIV = CreateScalarIV(Step);
1897     CreateSplatIV(ScalarIV, Step);
1898     return;
1899   }
1900 
1901   // Determine if we want a scalar version of the induction variable. This is
1902   // true if the induction variable itself is not widened, or if it has at
1903   // least one user in the loop that is not widened.
1904   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1905   if (!NeedsScalarIV) {
1906     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1907     return;
1908   }
1909 
1910   // Try to create a new independent vector induction variable. If we can't
1911   // create the phi node, we will splat the scalar induction variable in each
1912   // loop iteration.
1913   if (!shouldScalarizeInstruction(EntryVal)) {
1914     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1915     Value *ScalarIV = CreateScalarIV(Step);
1916     // Create scalar steps that can be used by instructions we will later
1917     // scalarize. Note that the addition of the scalar steps will not increase
1918     // the number of instructions in the loop in the common case prior to
1919     // InstCombine. We will be trading one vector extract for each scalar step.
1920     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1921     return;
1922   }
1923 
1924   // All IV users are scalar instructions, so only emit a scalar IV, not a
1925   // vectorised IV. Except when we tail-fold, then the splat IV feeds the
1926   // predicate used by the masked loads/stores.
1927   Value *ScalarIV = CreateScalarIV(Step);
1928   if (!Cost->isScalarEpilogueAllowed())
1929     CreateSplatIV(ScalarIV, Step);
1930   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1931 }
1932 
1933 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1934                                           Instruction::BinaryOps BinOp) {
1935   // Create and check the types.
1936   auto *ValVTy = cast<VectorType>(Val->getType());
1937   int VLen = ValVTy->getNumElements();
1938 
1939   Type *STy = Val->getType()->getScalarType();
1940   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1941          "Induction Step must be an integer or FP");
1942   assert(Step->getType() == STy && "Step has wrong type");
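  // For example (illustrative only), with VLen = 4, StartIdx = 0 and Step = 1,
  // a splatted start value <s, s, s, s> becomes <s, s+1, s+2, s+3>.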
1943 
1944   SmallVector<Constant *, 8> Indices;
1945 
1946   if (STy->isIntegerTy()) {
1947     // Create a vector of consecutive numbers from zero to VF.
1948     for (int i = 0; i < VLen; ++i)
1949       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1950 
1951     // Add the consecutive indices to the vector value.
1952     Constant *Cv = ConstantVector::get(Indices);
1953     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1954     Step = Builder.CreateVectorSplat(VLen, Step);
1955     assert(Step->getType() == Val->getType() && "Invalid step vec");
1956     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1957     // which can be found from the original scalar operations.
1958     Step = Builder.CreateMul(Cv, Step);
1959     return Builder.CreateAdd(Val, Step, "induction");
1960   }
1961 
1962   // Floating point induction.
1963   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1964          "Binary Opcode should be specified for FP induction");
1965   // Create a vector of consecutive numbers from zero to VF.
1966   for (int i = 0; i < VLen; ++i)
1967     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1968 
1969   // Add the consecutive indices to the vector value.
1970   Constant *Cv = ConstantVector::get(Indices);
1971 
1972   Step = Builder.CreateVectorSplat(VLen, Step);
1973 
1974   // Floating point operations had to be 'fast' to enable the induction.
1975   FastMathFlags Flags;
1976   Flags.setFast();
1977 
1978   Value *MulOp = Builder.CreateFMul(Cv, Step);
1979   if (isa<Instruction>(MulOp))
1980     // Have to check, MulOp may be a constant
1981     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1982 
1983   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1984   if (isa<Instruction>(BOp))
1985     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1986   return BOp;
1987 }
1988 
1989 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1990                                            Instruction *EntryVal,
1991                                            const InductionDescriptor &ID) {
1992   // We shouldn't have to build scalar steps if we aren't vectorizing.
1993   assert(VF > 1 && "VF should be greater than one");
1994 
1995   // Get the value type and ensure it and the step have the same integer type.
1996   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1997   assert(ScalarIVTy == Step->getType() &&
1998          "Val and Step should have the same type");
1999 
2000   // We build scalar steps for both integer and floating-point induction
2001   // variables. Here, we determine the kind of arithmetic we will perform.
2002   Instruction::BinaryOps AddOp;
2003   Instruction::BinaryOps MulOp;
2004   if (ScalarIVTy->isIntegerTy()) {
2005     AddOp = Instruction::Add;
2006     MulOp = Instruction::Mul;
2007   } else {
2008     AddOp = ID.getInductionOpcode();
2009     MulOp = Instruction::FMul;
2010   }
2011 
2012   // Determine the number of scalars we need to generate for each unroll
2013   // iteration. If EntryVal is uniform, we only need to generate the first
2014   // lane. Otherwise, we generate all VF values.
2015   unsigned Lanes =
2016       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
2017                                                                          : VF;
2018   // Compute the scalar steps and save the results in VectorLoopValueMap.
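  // For example (illustrative only), the scalar step for lane Lane of unroll
  // part Part is ScalarIV + (VF * Part + Lane) * Step, i.e. lane Lane of the
  // corresponding vector induction value.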
2019   for (unsigned Part = 0; Part < UF; ++Part) {
2020     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2021       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2022       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2023       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2024       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2025       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2026     }
2027   }
2028 }
2029 
2030 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2031   assert(V != Induction && "The new induction variable should not be used.");
2032   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2033   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2034 
2035   // If we have a stride that is replaced by one, do it here. Defer this for
2036   // the VPlan-native path until we start running Legal checks in that path.
2037   if (!EnableVPlanNativePath && Legal->hasStride(V))
2038     V = ConstantInt::get(V->getType(), 1);
2039 
2040   // If we have a vector mapped to this value, return it.
2041   if (VectorLoopValueMap.hasVectorValue(V, Part))
2042     return VectorLoopValueMap.getVectorValue(V, Part);
2043 
2044   // If the value has not been vectorized, check if it has been scalarized
2045   // instead. If it has been scalarized, and we actually need the value in
2046   // vector form, we will construct the vector values on demand.
2047   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2048     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2049 
2050     // If we've scalarized a value, that value should be an instruction.
2051     auto *I = cast<Instruction>(V);
2052 
2053     // If we aren't vectorizing, we can just copy the scalar map values over to
2054     // the vector map.
2055     if (VF == 1) {
2056       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2057       return ScalarValue;
2058     }
2059 
2060     // Get the last scalar instruction we generated for V and Part. If the value
2061     // is known to be uniform after vectorization, this corresponds to lane zero
2062     // of the Part unroll iteration. Otherwise, the last instruction is the one
2063     // we created for the last vector lane of the Part unroll iteration.
2064     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2065     auto *LastInst = cast<Instruction>(
2066         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2067 
2068     // Set the insert point after the last scalarized instruction. This ensures
2069     // the insertelement sequence will directly follow the scalar definitions.
2070     auto OldIP = Builder.saveIP();
2071     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2072     Builder.SetInsertPoint(&*NewIP);
2073 
2074     // However, if we are vectorizing, we need to construct the vector values.
2075     // If the value is known to be uniform after vectorization, we can just
2076     // broadcast the scalar value corresponding to lane zero for each unroll
2077     // iteration. Otherwise, we construct the vector values using insertelement
2078     // instructions. Since the resulting vectors are stored in
2079     // VectorLoopValueMap, we will only generate the insertelements once.
2080     Value *VectorValue = nullptr;
2081     if (Cost->isUniformAfterVectorization(I, VF)) {
2082       VectorValue = getBroadcastInstrs(ScalarValue);
2083       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2084     } else {
2085       // Initialize packing with insertelements to start from undef.
2086       Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF));
2087       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2088       for (unsigned Lane = 0; Lane < VF; ++Lane)
2089         packScalarIntoVectorValue(V, {Part, Lane});
2090       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2091     }
2092     Builder.restoreIP(OldIP);
2093     return VectorValue;
2094   }
2095 
2096   // If this scalar is unknown, assume that it is a constant or that it is
2097   // loop invariant. Broadcast V and save the value for future uses.
2098   Value *B = getBroadcastInstrs(V);
2099   VectorLoopValueMap.setVectorValue(V, Part, B);
2100   return B;
2101 }
2102 
2103 Value *
2104 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2105                                             const VPIteration &Instance) {
2106   // If the value is not an instruction contained in the loop, it should
2107   // already be scalar.
2108   if (OrigLoop->isLoopInvariant(V))
2109     return V;
2110 
2111   assert(Instance.Lane > 0
2112              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2113              : true && "Uniform values only have lane zero");
2114 
2115   // If the value from the original loop has not been vectorized, it is
2116   // represented by UF x VF scalar values in the new loop. Return the requested
2117   // scalar value.
2118   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2119     return VectorLoopValueMap.getScalarValue(V, Instance);
2120 
2121   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2122   // for the given unroll part. If this entry is not a vector type (i.e., the
2123   // vectorization factor is one), there is no need to generate an
2124   // extractelement instruction.
2125   auto *U = getOrCreateVectorValue(V, Instance.Part);
2126   if (!U->getType()->isVectorTy()) {
2127     assert(VF == 1 && "Value not scalarized has non-vector type");
2128     return U;
2129   }
2130 
2131   // Otherwise, the value from the original loop has been vectorized and is
2132   // represented by UF vector values. Extract and return the requested scalar
2133   // value from the appropriate vector lane.
2134   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2135 }
2136 
2137 void InnerLoopVectorizer::packScalarIntoVectorValue(
2138     Value *V, const VPIteration &Instance) {
2139   assert(V != Induction && "The new induction variable should not be used.");
2140   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2141   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2142 
2143   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2144   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2145   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2146                                             Builder.getInt32(Instance.Lane));
2147   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2148 }
2149 
2150 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2151   assert(Vec->getType()->isVectorTy() && "Invalid type");
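  // For example (illustrative only), for VF = 4 this builds the shuffle mask
  // <3, 2, 1, 0>, so <a, b, c, d> becomes <d, c, b, a>.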
2152   SmallVector<int, 8> ShuffleMask;
2153   for (unsigned i = 0; i < VF; ++i)
2154     ShuffleMask.push_back(VF - i - 1);
2155 
2156   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2157                                      ShuffleMask, "reverse");
2158 }
2159 
2160 // Return whether we allow using masked interleave-groups (for dealing with
2161 // strided loads/stores that reside in predicated blocks, or for dealing
2162 // with gaps).
2163 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2164   // If an override option has been passed in for interleaved accesses, use it.
2165   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2166     return EnableMaskedInterleavedMemAccesses;
2167 
2168   return TTI.enableMaskedInterleavedAccessVectorization();
2169 }
2170 
2171 // Try to vectorize the interleave group that \p Instr belongs to.
2172 //
// E.g. Translate the following interleaved load group (factor = 3):
2174 //   for (i = 0; i < N; i+=3) {
2175 //     R = Pic[i];             // Member of index 0
2176 //     G = Pic[i+1];           // Member of index 1
2177 //     B = Pic[i+2];           // Member of index 2
2178 //     ... // do something to R, G, B
2179 //   }
2180 // To:
2181 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2182 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2183 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2184 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2185 //
// Or translate the following interleaved store group (factor = 3):
2187 //   for (i = 0; i < N; i+=3) {
2188 //     ... do something to R, G, B
2189 //     Pic[i]   = R;           // Member of index 0
2190 //     Pic[i+1] = G;           // Member of index 1
2191 //     Pic[i+2] = B;           // Member of index 2
2192 //   }
2193 // To:
2194 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2195 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2196 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2197 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2198 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2199 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2200     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2201     VPValue *Addr, VPValue *BlockInMask) {
2202   Instruction *Instr = Group->getInsertPos();
2203   const DataLayout &DL = Instr->getModule()->getDataLayout();
2204 
2205   // Prepare for the vector type of the interleaved load/store.
2206   Type *ScalarTy = getMemInstValueType(Instr);
2207   unsigned InterleaveFactor = Group->getFactor();
2208   auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF);
2209 
2210   // Prepare for the new pointers.
2211   SmallVector<Value *, 2> AddrParts;
2212   unsigned Index = Group->getIndex(Instr);
2213 
2214   // TODO: extend the masked interleaved-group support to reversed access.
2215   assert((!BlockInMask || !Group->isReverse()) &&
2216          "Reversed masked interleave-group not supported.");
2217 
2218   // If the group is reverse, adjust the index to refer to the last vector lane
2219   // instead of the first. We adjust the index from the first vector lane,
2220   // rather than directly getting the pointer for lane VF - 1, because the
2221   // pointer operand of the interleaved access is supposed to be uniform. For
2222   // uniform instructions, we're only required to generate a value for the
2223   // first vector lane in each unroll iteration.
2224   if (Group->isReverse())
2225     Index += (VF - 1) * Group->getFactor();
2226 
2227   for (unsigned Part = 0; Part < UF; Part++) {
2228     Value *AddrPart = State.get(Addr, {Part, 0});
2229     setDebugLocFromInst(Builder, AddrPart);
2230 
    // Note that the current instruction could be at any index. We need to
    // adjust the address to the member of index 0.
2233     //
2234     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2235     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2237     //
2238     // E.g.  A[i+1] = a;     // Member of index 1
2239     //       A[i]   = b;     // Member of index 0
2240     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2242 
2243     bool InBounds = false;
2244     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2245       InBounds = gep->isInBounds();
2246     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2247     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2248 
2249     // Cast to the vector pointer type.
2250     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2251     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2252     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2253   }
2254 
2255   setDebugLocFromInst(Builder, Instr);
2256   Value *UndefVec = UndefValue::get(VecTy);
2257 
2258   Value *MaskForGaps = nullptr;
2259   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2260     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2261     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2262   }
2263 
2264   // Vectorize the interleaved load group.
2265   if (isa<LoadInst>(Instr)) {
2266     // For each unroll part, create a wide load for the group.
2267     SmallVector<Value *, 2> NewLoads;
2268     for (unsigned Part = 0; Part < UF; Part++) {
2269       Instruction *NewLoad;
2270       if (BlockInMask || MaskForGaps) {
2271         assert(useMaskedInterleavedAccesses(*TTI) &&
2272                "masked interleaved groups are not allowed.");
2273         Value *GroupMask = MaskForGaps;
2274         if (BlockInMask) {
2275           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2276           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2277           Value *ShuffledMask = Builder.CreateShuffleVector(
2278               BlockInMaskPart, Undefs,
2279               createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");
2280           GroupMask = MaskForGaps
2281                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2282                                                 MaskForGaps)
2283                           : ShuffledMask;
2284         }
2285         NewLoad =
2286             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2287                                      GroupMask, UndefVec, "wide.masked.vec");
2288       }
2289       else
2290         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2291                                             Group->getAlign(), "wide.vec");
2292       Group->addMetadata(NewLoad);
2293       NewLoads.push_back(NewLoad);
2294     }
2295 
2296     // For each member in the group, shuffle out the appropriate data from the
2297     // wide loads.
2298     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2299       Instruction *Member = Group->getMember(I);
2300 
2301       // Skip the gaps in the group.
2302       if (!Member)
2303         continue;
2304 
2305       auto StrideMask = createStrideMask(I, InterleaveFactor, VF);
2306       for (unsigned Part = 0; Part < UF; Part++) {
2307         Value *StridedVec = Builder.CreateShuffleVector(
2308             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2309 
        // If this member has a different type, cast the result to that type.
2311         if (Member->getType() != ScalarTy) {
2312           VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF);
2313           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2314         }
2315 
2316         if (Group->isReverse())
2317           StridedVec = reverseVector(StridedVec);
2318 
2319         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2320       }
2321     }
2322     return;
2323   }
2324 
  // The sub-vector type for the current instruction.
2326   auto *SubVT = FixedVectorType::get(ScalarTy, VF);
2327 
2328   // Vectorize the interleaved store group.
2329   for (unsigned Part = 0; Part < UF; Part++) {
2330     // Collect the stored vector from each member.
2331     SmallVector<Value *, 4> StoredVecs;
2332     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
2334       Instruction *Member = Group->getMember(i);
2335       assert(Member && "Fail to get a member from an interleaved store group");
2336 
2337       Value *StoredVec = getOrCreateVectorValue(
2338           cast<StoreInst>(Member)->getValueOperand(), Part);
2339       if (Group->isReverse())
2340         StoredVec = reverseVector(StoredVec);
2341 
      // If this member has a different type, cast it to a unified type.
2344       if (StoredVec->getType() != SubVT)
2345         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2346 
2347       StoredVecs.push_back(StoredVec);
2348     }
2349 
2350     // Concatenate all vectors into a wide vector.
2351     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2352 
2353     // Interleave the elements in the wide vector.
2354     Value *IVec = Builder.CreateShuffleVector(
2355         WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor),
2356         "interleaved.vec");
2357 
2358     Instruction *NewStoreInstr;
2359     if (BlockInMask) {
2360       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2361       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2362       Value *ShuffledMask = Builder.CreateShuffleVector(
2363           BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF),
2364           "interleaved.mask");
2365       NewStoreInstr = Builder.CreateMaskedStore(
2366           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2367     }
2368     else
2369       NewStoreInstr =
2370           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2371 
2372     Group->addMetadata(NewStoreInstr);
2373   }
2374 }
2375 
2376 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2377                                                      VPTransformState &State,
2378                                                      VPValue *Addr,
2379                                                      VPValue *StoredValue,
2380                                                      VPValue *BlockInMask) {
2381   // Attempt to issue a wide load.
2382   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2383   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2384 
2385   assert((LI || SI) && "Invalid Load/Store instruction");
2386   assert((!SI || StoredValue) && "No stored value provided for widened store");
2387   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2388 
2389   LoopVectorizationCostModel::InstWidening Decision =
2390       Cost->getWideningDecision(Instr, VF);
2391   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2392           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2393           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2394          "CM decision is not to widen the memory instruction");
2395 
2396   Type *ScalarDataTy = getMemInstValueType(Instr);
2397   auto *DataTy = FixedVectorType::get(ScalarDataTy, VF);
2398   const Align Alignment = getLoadStoreAlignment(Instr);
2399 
2400   // Determine if the pointer operand of the access is either consecutive or
2401   // reverse consecutive.
2402   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2403   bool ConsecutiveStride =
2404       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2405   bool CreateGatherScatter =
2406       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2407 
2408   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2409   // gather/scatter. Otherwise Decision should have been to Scalarize.
2410   assert((ConsecutiveStride || CreateGatherScatter) &&
2411          "The instruction should be scalarized");
2412   (void)ConsecutiveStride;
2413 
2414   VectorParts BlockInMaskParts(UF);
2415   bool isMaskRequired = BlockInMask;
2416   if (isMaskRequired)
2417     for (unsigned Part = 0; Part < UF; ++Part)
2418       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2419 
2420   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2421     // Calculate the pointer for the specific unroll-part.
2422     GetElementPtrInst *PartPtr = nullptr;
2423 
2424     bool InBounds = false;
2425     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2426       InBounds = gep->isInBounds();
2427 
2428     if (Reverse) {
2429       // If the address is consecutive but reversed, then the
2430       // wide load or store needs to start at the last vector element.
2431       PartPtr = cast<GetElementPtrInst>(
2432           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2433       PartPtr->setIsInBounds(InBounds);
2434       PartPtr = cast<GetElementPtrInst>(
2435           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2436       PartPtr->setIsInBounds(InBounds);
2437       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2438         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2439     } else {
2440       PartPtr = cast<GetElementPtrInst>(
2441           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2442       PartPtr->setIsInBounds(InBounds);
2443     }
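    // For illustration, with VF = 4 a reverse access uses Ptr - 3 for part 0
    // (covering Ptr[-3..0]) and Ptr - 7 for part 1 (covering Ptr[-7..-4]);
    // the data itself is brought back into lane order via reverseVector().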
2444 
2445     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2446     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2447   };
2448 
2449   // Handle Stores:
2450   if (SI) {
2451     setDebugLocFromInst(Builder, SI);
2452 
2453     for (unsigned Part = 0; Part < UF; ++Part) {
2454       Instruction *NewSI = nullptr;
2455       Value *StoredVal = State.get(StoredValue, Part);
2456       if (CreateGatherScatter) {
2457         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2458         Value *VectorGep = State.get(Addr, Part);
2459         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2460                                             MaskPart);
2461       } else {
2462         if (Reverse) {
2463           // If we store to reverse consecutive memory locations, then we need
2464           // to reverse the order of elements in the stored value.
2465           StoredVal = reverseVector(StoredVal);
2466           // We don't want to update the value in the map as it might be used in
2467           // another expression. So don't call resetVectorValue(StoredVal).
2468         }
2469         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2470         if (isMaskRequired)
2471           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2472                                             BlockInMaskParts[Part]);
2473         else
2474           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2475       }
2476       addMetadata(NewSI, SI);
2477     }
2478     return;
2479   }
2480 
2481   // Handle loads.
2482   assert(LI && "Must have a load instruction");
2483   setDebugLocFromInst(Builder, LI);
2484   for (unsigned Part = 0; Part < UF; ++Part) {
2485     Value *NewLI;
2486     if (CreateGatherScatter) {
2487       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2488       Value *VectorGep = State.get(Addr, Part);
2489       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2490                                          nullptr, "wide.masked.gather");
2491       addMetadata(NewLI, LI);
2492     } else {
2493       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2494       if (isMaskRequired)
2495         NewLI = Builder.CreateMaskedLoad(
2496             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2497             "wide.masked.load");
2498       else
2499         NewLI =
2500             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2501 
2502       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2503       addMetadata(NewLI, LI);
2504       if (Reverse)
2505         NewLI = reverseVector(NewLI);
2506     }
2507     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2508   }
2509 }
2510 
2511 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2512                                                const VPIteration &Instance,
2513                                                bool IfPredicateInstr,
2514                                                VPTransformState &State) {
2515   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2516 
2517   setDebugLocFromInst(Builder, Instr);
2518 
2519   // Does this instruction return a value?
2520   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2521 
2522   Instruction *Cloned = Instr->clone();
2523   if (!IsVoidRetTy)
2524     Cloned->setName(Instr->getName() + ".cloned");
2525 
2526   // Replace the operands of the cloned instructions with their scalar
2527   // equivalents in the new loop.
2528   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2529     auto *NewOp = State.get(User.getOperand(op), Instance);
2530     Cloned->setOperand(op, NewOp);
2531   }
2532   addNewMetadata(Cloned, Instr);
2533 
2534   // Place the cloned scalar in the new loop.
2535   Builder.Insert(Cloned);
2536 
2537   // Add the cloned scalar to the scalar map entry.
2538   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2539 
2540   // If we just cloned a new assumption, add it to the assumption cache.
2541   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2542     if (II->getIntrinsicID() == Intrinsic::assume)
2543       AC->registerAssumption(II);
2544 
2545   // End if-block.
2546   if (IfPredicateInstr)
2547     PredicatedInstructions.push_back(Cloned);
2548 }
2549 
2550 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2551                                                       Value *End, Value *Step,
2552                                                       Instruction *DL) {
2553   BasicBlock *Header = L->getHeader();
2554   BasicBlock *Latch = L->getLoopLatch();
2555   // As we're just creating this loop, it's possible no latch exists
2556   // yet. If so, use the header as this will be a single block loop.
2557   if (!Latch)
2558     Latch = Header;
2559 
2560   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2561   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2562   setDebugLocFromInst(Builder, OldInst);
2563   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2564 
2565   Builder.SetInsertPoint(Latch->getTerminator());
2566   setDebugLocFromInst(Builder, OldInst);
2567 
2568   // Create i+1 and fill the PHINode.
2569   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2570   Induction->addIncoming(Start, L->getLoopPreheader());
2571   Induction->addIncoming(Next, Latch);
2572   // Create the compare.
2573   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2574   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2575 
2576   // Now we have two terminators. Remove the old one from the block.
2577   Latch->getTerminator()->eraseFromParent();
2578 
2579   return Induction;
2580 }
2581 
2582 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2583   if (TripCount)
2584     return TripCount;
2585 
2586   assert(L && "Create Trip Count for null loop.");
2587   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2588   // Find the loop boundaries.
2589   ScalarEvolution *SE = PSE.getSE();
2590   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2591   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2592          "Invalid loop count");
2593 
2594   Type *IdxTy = Legal->getWidestInductionType();
2595   assert(IdxTy && "No type for induction");
2596 
2597   // The exit count might have type i64 while the phi is i32. This can happen
2598   // if we have an induction variable that is sign-extended before the compare.
2599   // The only way we can get a backedge-taken count in that case is if the
2600   // induction variable is signed and therefore does not overflow, so the
2601   // truncation is legal.
2602   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2603       IdxTy->getPrimitiveSizeInBits())
2604     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2605   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2606 
2607   // Get the total trip count from the count by adding 1.
2608   const SCEV *ExitCount = SE->getAddExpr(
2609       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2610 
2611   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2612 
2613   // Expand the trip count and place the new instructions in the preheader.
2614   // Notice that the pre-header does not change, only the loop body.
2615   SCEVExpander Exp(*SE, DL, "induction");
2616 
2617   // Count holds the overall loop count (N).
2618   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2619                                 L->getLoopPreheader()->getTerminator());
2620 
2621   if (TripCount->getType()->isPointerTy())
2622     TripCount =
2623         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2624                                     L->getLoopPreheader()->getTerminator());
2625 
2626   return TripCount;
2627 }
2628 
2629 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2630   if (VectorTripCount)
2631     return VectorTripCount;
2632 
2633   Value *TC = getOrCreateTripCount(L);
2634   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2635 
2636   Type *Ty = TC->getType();
2637   Constant *Step = ConstantInt::get(Ty, VF * UF);
2638 
2639   // If the tail is to be folded by masking, round the number of iterations N
2640   // up to a multiple of Step instead of rounding down. This is done by first
2641   // adding Step-1 and then rounding down. Note that it's ok if this addition
2642   // overflows: the vector induction variable will eventually wrap to zero given
2643   // that it starts at zero and its Step is a power of two; the loop will then
2644   // exit, with the last early-exit vector comparison also producing all-true.
2645   if (Cost->foldTailByMasking()) {
2646     assert(isPowerOf2_32(VF * UF) &&
2647            "VF*UF must be a power of 2 when folding tail by masking");
2648     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2649   }
2650 
2651   // Now we need to generate the expression for the part of the loop that the
2652   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2653   // iterations are not required for correctness, or N - Step, otherwise. Step
2654   // is equal to the vectorization factor (number of SIMD elements) times the
2655   // unroll factor (number of SIMD instructions).
2656   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
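  // For example, with VF = 4, UF = 1 and a trip count of 10: without tail
  // folding R = 10 % 4 = 2 and the vector trip count is 8; with tail folding
  // TC was rounded up to 13 above, so R = 1 and the vector trip count is 12,
  // with the two extra lanes masked off.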
2657 
2658   // If there is a non-reversed interleaved group that may speculatively access
2659   // memory out-of-bounds, we need to ensure that there will be at least one
2660   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2661   // the trip count, we set the remainder to be equal to the step. If the step
2662   // does not evenly divide the trip count, no adjustment is necessary since
2663   // there will already be scalar iterations. Note that the minimum iterations
2664   // check ensures that N >= Step.
2665   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2666     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2667     R = Builder.CreateSelect(IsZero, Step, R);
2668   }
2669 
2670   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2671 
2672   return VectorTripCount;
2673 }
2674 
2675 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2676                                                    const DataLayout &DL) {
2677   // Verify that V is a vector type with same number of elements as DstVTy.
2678   unsigned VF = DstVTy->getNumElements();
2679   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2680   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2681   Type *SrcElemTy = SrcVecTy->getElementType();
2682   Type *DstElemTy = DstVTy->getElementType();
2683   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2684          "Vector elements must have same size");
2685 
2686   // Do a direct cast if element types are castable.
2687   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2688     return Builder.CreateBitOrPointerCast(V, DstVTy);
2689   }
2690   // V cannot be directly cast to the desired vector type. This may happen
2691   // when V is a floating point vector but DstVTy is a vector of pointers, or
2692   // vice-versa. Handle this with a two-step cast through an intermediate
2693   // integer vector type, i.e. Ptr <-> Int <-> Float.
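  // For example, with 64-bit pointers, casting <4 x double> to <4 x i8*> is
  // done as a bitcast to <4 x i64> followed by an inttoptr-style
  // bit-or-pointer cast of the integer vector to the destination type.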
2694   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2695          "Only one type should be a pointer type");
2696   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2697          "Only one type should be a floating point type");
2698   Type *IntTy =
2699       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2700   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2701   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2702   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2703 }
2704 
2705 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2706                                                          BasicBlock *Bypass) {
2707   Value *Count = getOrCreateTripCount(L);
2708   // Reuse existing vector loop preheader for TC checks.
2709   // Note that new preheader block is generated for vector loop.
2710   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2711   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2712 
2713   // Generate code to check if the loop's trip count is less than VF * UF, or
2714   // equal to it in case a scalar epilogue is required; this implies that the
2715   // vector trip count is zero. This check also covers the case where adding one
2716   // to the backedge-taken count overflowed leading to an incorrect trip count
2717   // of zero. In this case we will also jump to the scalar loop.
2718   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2719                                           : ICmpInst::ICMP_ULT;
2720 
2721   // If tail is to be folded, vector loop takes care of all iterations.
2722   Value *CheckMinIters = Builder.getFalse();
2723   if (!Cost->foldTailByMasking())
2724     CheckMinIters = Builder.CreateICmp(
2725         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2726         "min.iters.check");
2727 
2728   // Create new preheader for vector loop.
2729   LoopVectorPreHeader =
2730       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2731                  "vector.ph");
2732 
2733   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2734                                DT->getNode(Bypass)->getIDom()) &&
2735          "TC check is expected to dominate Bypass");
2736 
2737   // Update dominator for Bypass & LoopExit.
2738   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2739   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2740 
2741   ReplaceInstWithInst(
2742       TCCheckBlock->getTerminator(),
2743       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2744   LoopBypassBlocks.push_back(TCCheckBlock);
2745 }
2746 
2747 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2748   // Reuse existing vector loop preheader for SCEV checks.
2749   // Note that new preheader block is generated for vector loop.
2750   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2751 
2752   // Generate the code to check the SCEV assumptions that we made.
2753   // We want the new basic block to start at the first instruction in a
2754   // sequence of instructions that form a check.
2755   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2756                    "scev.check");
2757   Value *SCEVCheck = Exp.expandCodeForPredicate(
2758       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2759 
2760   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2761     if (C->isZero())
2762       return;
2763 
2764   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2765            llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
2766                                        PGSOQueryType::IRPass)) &&
2767          "Cannot SCEV check stride or overflow when optimizing for size");
2768 
2769   SCEVCheckBlock->setName("vector.scevcheck");
2770   // Create new preheader for vector loop.
2771   LoopVectorPreHeader =
2772       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2773                  nullptr, "vector.ph");
2774 
2775   // Update dominator only if this is first RT check.
2776   if (LoopBypassBlocks.empty()) {
2777     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2778     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2779   }
2780 
2781   ReplaceInstWithInst(
2782       SCEVCheckBlock->getTerminator(),
2783       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2784   LoopBypassBlocks.push_back(SCEVCheckBlock);
2785   AddedSafetyChecks = true;
2786 }
2787 
2788 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2789   // VPlan-native path does not do any analysis for runtime checks currently.
2790   if (EnableVPlanNativePath)
2791     return;
2792 
2793   // Reuse existing vector loop preheader for runtime memory checks.
2794   // Note that new preheader block is generated for vector loop.
2795   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2796 
2797   // Generate the code that checks at runtime whether arrays overlap. We put the
2798   // checks into a separate block to make the more common case of few elements
2799   // faster.
2800   auto *LAI = Legal->getLAI();
2801   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2802   if (!RtPtrChecking.Need)
2803     return;
2804   Instruction *FirstCheckInst;
2805   Instruction *MemRuntimeCheck;
2806   std::tie(FirstCheckInst, MemRuntimeCheck) =
2807       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2808                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2809   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2810                             "claimed checks are required");
2811 
2812   if (MemCheckBlock->getParent()->hasOptSize() ||
2813       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
2814                                   PGSOQueryType::IRPass)) {
2815     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2816            "Cannot emit memory checks when optimizing for size, unless forced "
2817            "to vectorize.");
2818     ORE->emit([&]() {
2819       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2820                                         L->getStartLoc(), L->getHeader())
2821              << "Code-size may be reduced by not forcing "
2822                 "vectorization, or by source-code modifications "
2823                 "eliminating the need for runtime checks "
2824                 "(e.g., adding 'restrict').";
2825     });
2826   }
2827 
2828   MemCheckBlock->setName("vector.memcheck");
2829   // Create new preheader for vector loop.
2830   LoopVectorPreHeader =
2831       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2832                  "vector.ph");
2833 
2834   // Update dominator only if this is first RT check.
2835   if (LoopBypassBlocks.empty()) {
2836     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2837     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2838   }
2839 
2840   ReplaceInstWithInst(
2841       MemCheckBlock->getTerminator(),
2842       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2843   LoopBypassBlocks.push_back(MemCheckBlock);
2844   AddedSafetyChecks = true;
2845 
2846   // We currently don't use LoopVersioning for the actual loop cloning but we
2847   // still use it to add the noalias metadata.
2848   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2849                                           PSE.getSE());
2850   LVer->prepareNoAliasMetadata();
2851 }
2852 
2853 Value *InnerLoopVectorizer::emitTransformedIndex(
2854     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2855     const InductionDescriptor &ID) const {
2856 
2857   SCEVExpander Exp(*SE, DL, "induction");
2858   auto Step = ID.getStep();
2859   auto StartValue = ID.getStartValue();
2860   assert(Index->getType() == Step->getType() &&
2861          "Index type does not match StepValue type");
2862 
2863   // Note: the IR at this point is broken. We cannot use SE to create any new
2864   // SCEV and then expand it, hoping that SCEV's simplification will give us
2865   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2866   // lead to various SCEV crashes. So all we can do is use the builder and
2867   // rely on InstCombine for future simplifications. Here we handle only some
2868   // trivial cases.
2869   auto CreateAdd = [&B](Value *X, Value *Y) {
2870     assert(X->getType() == Y->getType() && "Types don't match!");
2871     if (auto *CX = dyn_cast<ConstantInt>(X))
2872       if (CX->isZero())
2873         return Y;
2874     if (auto *CY = dyn_cast<ConstantInt>(Y))
2875       if (CY->isZero())
2876         return X;
2877     return B.CreateAdd(X, Y);
2878   };
2879 
2880   auto CreateMul = [&B](Value *X, Value *Y) {
2881     assert(X->getType() == Y->getType() && "Types don't match!");
2882     if (auto *CX = dyn_cast<ConstantInt>(X))
2883       if (CX->isOne())
2884         return Y;
2885     if (auto *CY = dyn_cast<ConstantInt>(Y))
2886       if (CY->isOne())
2887         return X;
2888     return B.CreateMul(X, Y);
2889   };
2890 
2891   // Get a suitable insert point for SCEV expansion. For blocks in the vector
2892   // loop, choose the end of the vector loop header (=LoopVectorBody), because
2893   // the DomTree is not kept up-to-date for additional blocks generated in the
2894   // vector loop. By using the header as insertion point, we guarantee that the
2895   // expanded instructions dominate all their uses.
2896   auto GetInsertPoint = [this, &B]() {
2897     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
2898     if (InsertBB != LoopVectorBody &&
2899         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
2900       return LoopVectorBody->getTerminator();
2901     return &*B.GetInsertPoint();
2902   };
2903   switch (ID.getKind()) {
2904   case InductionDescriptor::IK_IntInduction: {
2905     assert(Index->getType() == StartValue->getType() &&
2906            "Index type does not match StartValue type");
2907     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2908       return B.CreateSub(StartValue, Index);
2909     auto *Offset = CreateMul(
2910         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
2911     return CreateAdd(StartValue, Offset);
2912   }
2913   case InductionDescriptor::IK_PtrInduction: {
2914     assert(isa<SCEVConstant>(Step) &&
2915            "Expected constant step for pointer induction");
2916     return B.CreateGEP(
2917         StartValue->getType()->getPointerElementType(), StartValue,
2918         CreateMul(Index,
2919                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
2920   }
2921   case InductionDescriptor::IK_FpInduction: {
2922     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2923     auto InductionBinOp = ID.getInductionBinOp();
2924     assert(InductionBinOp &&
2925            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2926             InductionBinOp->getOpcode() == Instruction::FSub) &&
2927            "Original bin op should be defined for FP induction");
2928 
2929     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2930 
2931     // Floating point operations had to be 'fast' to enable the induction.
2932     FastMathFlags Flags;
2933     Flags.setFast();
2934 
2935     Value *MulExp = B.CreateFMul(StepValue, Index);
2936     if (isa<Instruction>(MulExp))
2937       // We have to check because MulExp may be a constant.
2938       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2939 
2940     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2941                                "induction");
2942     if (isa<Instruction>(BOp))
2943       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2944 
2945     return BOp;
2946   }
2947   case InductionDescriptor::IK_NoInduction:
2948     return nullptr;
2949   }
2950   llvm_unreachable("invalid enum");
2951 }
2952 
2953 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2954   /*
2955    In this function we generate a new loop. The new loop will contain
2956    the vectorized instructions while the old loop will continue to run the
2957    scalar remainder.
2958 
2959        [ ] <-- loop iteration number check.
2960     /   |
2961    /    v
2962   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2963   |  /  |
2964   | /   v
2965   ||   [ ]     <-- vector pre header.
2966   |/    |
2967   |     v
2968   |    [  ] \
2969   |    [  ]_|   <-- vector loop.
2970   |     |
2971   |     v
2972   |   -[ ]   <--- middle-block.
2973   |  /  |
2974   | /   v
2975   -|- >[ ]     <--- new preheader.
2976    |    |
2977    |    v
2978    |   [ ] \
2979    |   [ ]_|   <-- old scalar loop to handle remainder.
2980     \   |
2981      \  v
2982       >[ ]     <-- exit block.
2983    ...
2984    */
2985 
2986   MDNode *OrigLoopID = OrigLoop->getLoopID();
2987 
2988   // Some loops have a single integer induction variable, while other loops
2989   // don't. One example is C++ iterators, which often have multiple pointer
2990   // induction variables. The code below also supports the case where we
2991   // don't have a single induction variable.
2992   //
2993   // We try to obtain an induction variable from the original loop as hard
2994   // as possible. However if we don't find one that:
2995   //   - is an integer
2996   //   - counts from zero, stepping by one
2997   //   - is the size of the widest induction variable type
2998   // then we create a new one.
2999   OldInduction = Legal->getPrimaryInduction();
3000   Type *IdxTy = Legal->getWidestInductionType();
3001 
3002   // Split the single block loop into the two loop structure described above.
3003   LoopScalarBody = OrigLoop->getHeader();
3004   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3005   LoopExitBlock = OrigLoop->getExitBlock();
3006   assert(LoopExitBlock && "Must have an exit block");
3007   assert(LoopVectorPreHeader && "Invalid loop structure");
3008 
3009   LoopMiddleBlock =
3010       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3011                  LI, nullptr, "middle.block");
3012   LoopScalarPreHeader =
3013       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3014                  nullptr, "scalar.ph");
3015   // We intentionally don't let SplitBlock update LoopInfo, since
3016   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3017   // LoopVectorBody is explicitly added to the correct place a few lines later.
3018   LoopVectorBody =
3019       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3020                  nullptr, nullptr, "vector.body");
3021 
3022   // Update dominator for loop exit.
3023   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3024 
3025   // Create and register the new vector loop.
3026   Loop *Lp = LI->AllocateLoop();
3027   Loop *ParentLoop = OrigLoop->getParentLoop();
3028 
3029   // Insert the new loop into the loop nest and register the new basic blocks
3030   // before calling any utilities such as SCEV that require valid LoopInfo.
3031   if (ParentLoop) {
3032     ParentLoop->addChildLoop(Lp);
3033   } else {
3034     LI->addTopLevelLoop(Lp);
3035   }
3036   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3037 
3038   // Find the loop boundaries.
3039   Value *Count = getOrCreateTripCount(Lp);
3040 
3041   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3042 
3043   // Now, compare the new count to zero. If it is zero skip the vector loop and
3044   // jump to the scalar loop. This check also covers the case where the
3045   // backedge-taken count is uint##_max: adding one to it will overflow leading
3046   // to an incorrect trip count of zero. In this (rare) case we will also jump
3047   // to the scalar loop.
3048   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3049 
3050   // Generate the code to check any assumptions that we've made for SCEV
3051   // expressions.
3052   emitSCEVChecks(Lp, LoopScalarPreHeader);
3053 
3054   // Generate the code that checks at runtime whether arrays overlap. We put the
3055   // checks into a separate block to make the more common case of few elements
3056   // faster.
3057   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3058 
3059   // Generate the induction variable.
3060   // The loop step is equal to the vectorization factor (num of SIMD elements)
3061   // times the unroll factor (num of SIMD instructions).
3062   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3063   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3064   Induction =
3065       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3066                               getDebugLocFromInstOrOperands(OldInduction));
3067 
3068   // We are going to resume the execution of the scalar loop.
3069   // Go over all of the induction variables that we found and fix the
3070   // PHIs that are left in the scalar version of the loop.
3071   // The starting values of PHI nodes depend on the counter of the last
3072   // iteration in the vectorized loop.
3073   // If we come from a bypass edge then we need to start from the original
3074   // start value.
3075 
3076   // This variable saves the new starting index for the scalar loop. It is used
3077   // to test if there are any tail iterations left once the vector loop has
3078   // completed.
3079   for (auto &InductionEntry : Legal->getInductionVars()) {
3080     PHINode *OrigPhi = InductionEntry.first;
3081     InductionDescriptor II = InductionEntry.second;
3082 
3083     // Create phi nodes to merge from the backedge-taken check block.
3084     PHINode *BCResumeVal =
3085         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3086                         LoopScalarPreHeader->getTerminator());
3087     // Copy original phi DL over to the new one.
3088     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3089     Value *&EndValue = IVEndValues[OrigPhi];
3090     if (OrigPhi == OldInduction) {
3091       // We know what the end value is.
3092       EndValue = CountRoundDown;
3093     } else {
3094       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3095       Type *StepType = II.getStep()->getType();
3096       Instruction::CastOps CastOp =
3097           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3098       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3099       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3100       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3101       EndValue->setName("ind.end");
3102     }
3103 
3104     // The new PHI merges the original incoming value, in case of a bypass,
3105     // or the value at the end of the vectorized loop.
3106     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3107 
3108     // Fix the scalar body counter (PHI node).
3109     // The old induction's phi node in the scalar body needs the truncated
3110     // value.
3111     for (BasicBlock *BB : LoopBypassBlocks)
3112       BCResumeVal->addIncoming(II.getStartValue(), BB);
3113     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3114   }
3115 
3116   // We need the OrigLoop (scalar loop part) latch terminator to help
3117   // produce correct debug info for the middle block BB instructions.
3118   // The legality check stage guarantees that the loop will have a single
3119   // latch.
3120   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3121          "Scalar loop latch terminator isn't a branch");
3122   BranchInst *ScalarLatchBr =
3123       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3124 
3125   // Add a check in the middle block to see if we have completed
3126   // all of the iterations in the first vector loop.
3127   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3128   // If tail is to be folded, we know we don't need to run the remainder.
3129   Value *CmpN = Builder.getTrue();
3130   if (!Cost->foldTailByMasking()) {
3131     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3132                            CountRoundDown, "cmp.n",
3133                            LoopMiddleBlock->getTerminator());
3134 
3135     // Here we use the same DebugLoc as the scalar loop latch branch instead
3136     // of the corresponding compare because they may have ended up with
3137     // different line numbers and we want to avoid awkward line stepping while
3138     // debugging. E.g., if the compare has a line number inside the loop.
3139     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3140   }
3141 
3142   BranchInst *BrInst =
3143       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3144   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3145   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3146 
3147   // Get ready to start creating new instructions into the vectorized body.
3148   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3149          "Inconsistent vector loop preheader");
3150   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3151 
3152   Optional<MDNode *> VectorizedLoopID =
3153       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3154                                       LLVMLoopVectorizeFollowupVectorized});
3155   if (VectorizedLoopID.hasValue()) {
3156     Lp->setLoopID(VectorizedLoopID.getValue());
3157 
3158     // Do not setAlreadyVectorized if loop attributes have been defined
3159     // explicitly.
3160     return LoopVectorPreHeader;
3161   }
3162 
3163   // Keep all loop hints from the original loop on the vector loop (we'll
3164   // replace the vectorizer-specific hints below).
3165   if (MDNode *LID = OrigLoop->getLoopID())
3166     Lp->setLoopID(LID);
3167 
3168   LoopVectorizeHints Hints(Lp, true, *ORE);
3169   Hints.setAlreadyVectorized();
3170 
3171 #ifdef EXPENSIVE_CHECKS
3172   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3173   LI->verify(*DT);
3174 #endif
3175 
3176   return LoopVectorPreHeader;
3177 }
3178 
3179 // Fix up external users of the induction variable. At this point, we are
3180 // in LCSSA form, with all external PHIs that use the IV having one input value,
3181 // coming from the remainder loop. We need those PHIs to also have a correct
3182 // value for the IV when arriving directly from the middle block.
3183 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3184                                        const InductionDescriptor &II,
3185                                        Value *CountRoundDown, Value *EndValue,
3186                                        BasicBlock *MiddleBlock) {
3187   // There are two kinds of external IV usages - those that use the value
3188   // computed in the last iteration (the PHI) and those that use the penultimate
3189   // value (the value that feeds into the phi from the loop latch).
3190   // We allow both, but they, obviously, have different values.
3191 
3192   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3193 
3194   DenseMap<Value *, Value *> MissingVals;
3195 
3196   // An external user of the last iteration's value should see the value that
3197   // the remainder loop uses to initialize its own IV.
3198   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3199   for (User *U : PostInc->users()) {
3200     Instruction *UI = cast<Instruction>(U);
3201     if (!OrigLoop->contains(UI)) {
3202       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3203       MissingVals[UI] = EndValue;
3204     }
3205   }
3206 
3207   // An external user of the penultimate value needs to see EndValue - Step.
3208   // The simplest way to get this is to recompute it from the constituent SCEVs,
3209   // that is Start + (Step * (CRD - 1)).
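  // For instance, for an IV with Start = 0, Step = 2 and CRD = 8, users of
  // the post-increment value see 0 + 2 * 8 = 16 (EndValue), while users of
  // the phi itself see 0 + 2 * (8 - 1) = 14.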
3210   for (User *U : OrigPhi->users()) {
3211     auto *UI = cast<Instruction>(U);
3212     if (!OrigLoop->contains(UI)) {
3213       const DataLayout &DL =
3214           OrigLoop->getHeader()->getModule()->getDataLayout();
3215       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3216 
3217       IRBuilder<> B(MiddleBlock->getTerminator());
3218       Value *CountMinusOne = B.CreateSub(
3219           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3220       Value *CMO =
3221           !II.getStep()->getType()->isIntegerTy()
3222               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3223                              II.getStep()->getType())
3224               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3225       CMO->setName("cast.cmo");
3226       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3227       Escape->setName("ind.escape");
3228       MissingVals[UI] = Escape;
3229     }
3230   }
3231 
3232   for (auto &I : MissingVals) {
3233     PHINode *PHI = cast<PHINode>(I.first);
3234     // One corner case we have to handle is two IVs "chasing" each other,
3235     // that is %IV2 = phi [...], [ %IV1, %latch ]
3236     // In this case, if IV1 has an external use, we need to avoid adding both
3237     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3238     // don't already have an incoming value for the middle block.
3239     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3240       PHI->addIncoming(I.second, MiddleBlock);
3241   }
3242 }
3243 
3244 namespace {
3245 
3246 struct CSEDenseMapInfo {
3247   static bool canHandle(const Instruction *I) {
3248     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3249            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3250   }
3251 
3252   static inline Instruction *getEmptyKey() {
3253     return DenseMapInfo<Instruction *>::getEmptyKey();
3254   }
3255 
3256   static inline Instruction *getTombstoneKey() {
3257     return DenseMapInfo<Instruction *>::getTombstoneKey();
3258   }
3259 
3260   static unsigned getHashValue(const Instruction *I) {
3261     assert(canHandle(I) && "Unknown instruction!");
3262     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3263                                                            I->value_op_end()));
3264   }
3265 
3266   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3267     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3268         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3269       return LHS == RHS;
3270     return LHS->isIdenticalTo(RHS);
3271   }
3272 };
3273 
3274 } // end anonymous namespace
3275 
3276 /// Perform CSE of induction variable instructions.
3277 static void cse(BasicBlock *BB) {
3278   // Perform simple cse.
3279   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3280   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3281     Instruction *In = &*I++;
3282 
3283     if (!CSEDenseMapInfo::canHandle(In))
3284       continue;
3285 
3286     // Check if we can replace this instruction with any of the
3287     // visited instructions.
3288     if (Instruction *V = CSEMap.lookup(In)) {
3289       In->replaceAllUsesWith(V);
3290       In->eraseFromParent();
3291       continue;
3292     }
3293 
3294     CSEMap[In] = In;
3295   }
3296 }
3297 
3298 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3299                                                        unsigned VF,
3300                                                        bool &NeedToScalarize) {
3301   Function *F = CI->getCalledFunction();
3302   Type *ScalarRetTy = CI->getType();
3303   SmallVector<Type *, 4> Tys, ScalarTys;
3304   for (auto &ArgOp : CI->arg_operands())
3305     ScalarTys.push_back(ArgOp->getType());
3306 
3307   // Estimate cost of scalarized vector call. The source operands are assumed
3308   // to be vectors, so we need to extract individual elements from them,
3309   // execute VF scalar calls, and then gather the result into the vector return
3310   // value.
3311   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3312                                                  TTI::TCK_RecipThroughput);
3313   if (VF == 1)
3314     return ScalarCallCost;
3315 
3316   // Compute corresponding vector type for return value and arguments.
3317   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3318   for (Type *ScalarTy : ScalarTys)
3319     Tys.push_back(ToVectorTy(ScalarTy, VF));
3320 
3321   // Compute costs of unpacking argument values for the scalar calls and
3322   // packing the return values to a vector.
3323   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3324 
3325   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
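  // E.g. with VF = 4, a scalar call cost of 10 and a scalarization overhead
  // of 12 (hypothetical numbers), the scalarized cost is 4 * 10 + 12 = 52;
  // it is returned unless a cheaper vector variant is found below.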
3326 
3327   // If we can't emit a vector call for this function, then the currently found
3328   // cost is the cost we need to return.
3329   NeedToScalarize = true;
3330   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3331   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3332 
3333   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3334     return Cost;
3335 
3336   // If the corresponding vector cost is cheaper, return its cost.
3337   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3338                                                  TTI::TCK_RecipThroughput);
3339   if (VectorCallCost < Cost) {
3340     NeedToScalarize = false;
3341     return VectorCallCost;
3342   }
3343   return Cost;
3344 }
3345 
3346 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3347                                                             unsigned VF) {
3348   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3349   assert(ID && "Expected intrinsic call!");
3350 
3351   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3352   return TTI.getIntrinsicInstrCost(CostAttrs,
3353                                    TargetTransformInfo::TCK_RecipThroughput);
3354 }
3355 
3356 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3357   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3358   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3359   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3360 }
3361 
3362 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3363   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3364   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3365   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3366 }
3367 
3368 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3369   // For every instruction `I` in MinBWs, truncate the operands, create a
3370   // truncated version of `I` and reextend its result. InstCombine runs
3371   // later and will remove any ext/trunc pairs.
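  // For example, if the cost model determined that an i32 add needs only 8
  // bits, its <VF x i32> form is rewritten here as truncs of the operands to
  // <VF x i8>, an i8 add, and a zext of the result back to <VF x i32>.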
3372   SmallPtrSet<Value *, 4> Erased;
3373   for (const auto &KV : Cost->getMinimalBitwidths()) {
3374     // If the value wasn't vectorized, we must maintain the original scalar
3375     // type. The absence of the value from VectorLoopValueMap indicates that it
3376     // wasn't vectorized.
3377     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3378       continue;
3379     for (unsigned Part = 0; Part < UF; ++Part) {
3380       Value *I = getOrCreateVectorValue(KV.first, Part);
3381       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3382         continue;
3383       Type *OriginalTy = I->getType();
3384       Type *ScalarTruncatedTy =
3385           IntegerType::get(OriginalTy->getContext(), KV.second);
3386       auto *TruncatedTy = FixedVectorType::get(
3387           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
3388       if (TruncatedTy == OriginalTy)
3389         continue;
3390 
3391       IRBuilder<> B(cast<Instruction>(I));
3392       auto ShrinkOperand = [&](Value *V) -> Value * {
3393         if (auto *ZI = dyn_cast<ZExtInst>(V))
3394           if (ZI->getSrcTy() == TruncatedTy)
3395             return ZI->getOperand(0);
3396         return B.CreateZExtOrTrunc(V, TruncatedTy);
3397       };
3398 
3399       // The actual instruction modification depends on the instruction type,
3400       // unfortunately.
3401       Value *NewI = nullptr;
3402       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3403         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3404                              ShrinkOperand(BO->getOperand(1)));
3405 
3406         // Any wrapping introduced by shrinking this operation shouldn't be
3407         // considered undefined behavior. So, we can't unconditionally copy
3408         // arithmetic wrapping flags to NewI.
3409         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3410       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3411         NewI =
3412             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3413                          ShrinkOperand(CI->getOperand(1)));
3414       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3415         NewI = B.CreateSelect(SI->getCondition(),
3416                               ShrinkOperand(SI->getTrueValue()),
3417                               ShrinkOperand(SI->getFalseValue()));
3418       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3419         switch (CI->getOpcode()) {
3420         default:
3421           llvm_unreachable("Unhandled cast!");
3422         case Instruction::Trunc:
3423           NewI = ShrinkOperand(CI->getOperand(0));
3424           break;
3425         case Instruction::SExt:
3426           NewI = B.CreateSExtOrTrunc(
3427               CI->getOperand(0),
3428               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3429           break;
3430         case Instruction::ZExt:
3431           NewI = B.CreateZExtOrTrunc(
3432               CI->getOperand(0),
3433               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3434           break;
3435         }
3436       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3437         auto Elements0 =
3438             cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
3439         auto *O0 = B.CreateZExtOrTrunc(
3440             SI->getOperand(0),
3441             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3442         auto Elements1 =
3443             cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
3444         auto *O1 = B.CreateZExtOrTrunc(
3445             SI->getOperand(1),
3446             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3447 
3448         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3449       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3450         // Don't do anything with the operands, just extend the result.
3451         continue;
3452       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3453         auto Elements =
3454             cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
3455         auto *O0 = B.CreateZExtOrTrunc(
3456             IE->getOperand(0),
3457             FixedVectorType::get(ScalarTruncatedTy, Elements));
3458         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3459         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3460       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3461         auto Elements =
3462             cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
3463         auto *O0 = B.CreateZExtOrTrunc(
3464             EE->getOperand(0),
3465             FixedVectorType::get(ScalarTruncatedTy, Elements));
3466         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3467       } else {
3468         // If we don't know what to do, be conservative and don't do anything.
3469         continue;
3470       }
3471 
3472       // Lastly, extend the result.
3473       NewI->takeName(cast<Instruction>(I));
3474       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3475       I->replaceAllUsesWith(Res);
3476       cast<Instruction>(I)->eraseFromParent();
3477       Erased.insert(I);
3478       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3479     }
3480   }
3481 
3482   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3483   for (const auto &KV : Cost->getMinimalBitwidths()) {
3484     // If the value wasn't vectorized, we must maintain the original scalar
3485     // type. The absence of the value from VectorLoopValueMap indicates that it
3486     // wasn't vectorized.
3487     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3488       continue;
3489     for (unsigned Part = 0; Part < UF; ++Part) {
3490       Value *I = getOrCreateVectorValue(KV.first, Part);
3491       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3492       if (Inst && Inst->use_empty()) {
3493         Value *NewI = Inst->getOperand(0);
3494         Inst->eraseFromParent();
3495         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3496       }
3497     }
3498   }
3499 }
3500 
3501 void InnerLoopVectorizer::fixVectorizedLoop() {
3502   // Insert truncates and extends for any truncated instructions as hints to
3503   // InstCombine.
3504   if (VF > 1)
3505     truncateToMinimalBitwidths();
3506 
3507   // Fix widened non-induction PHIs by setting up the PHI operands.
3508   if (OrigPHIsToFix.size()) {
3509     assert(EnableVPlanNativePath &&
3510            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3511     fixNonInductionPHIs();
3512   }
3513 
3514   // At this point every instruction in the original loop is widened to a
3515   // vector form. Now we need to fix the recurrences in the loop. These PHI
3516   // nodes are currently empty because we did not want to introduce cycles.
3517   // This is the second stage of vectorizing recurrences.
3518   fixCrossIterationPHIs();
3519 
3520   // Forget the original basic block.
3521   PSE.getSE()->forgetLoop(OrigLoop);
3522 
3523   // Fix-up external users of the induction variables.
3524   for (auto &Entry : Legal->getInductionVars())
3525     fixupIVUsers(Entry.first, Entry.second,
3526                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3527                  IVEndValues[Entry.first], LoopMiddleBlock);
3528 
3529   fixLCSSAPHIs();
3530   for (Instruction *PI : PredicatedInstructions)
3531     sinkScalarOperands(&*PI);
3532 
3533   // Remove redundant induction instructions.
3534   cse(LoopVectorBody);
3535 
3536   // Set/update profile weights for the vector and remainder loops as original
3537   // loop iterations are now distributed among them. Note that original loop
3538   // represented by LoopScalarBody becomes remainder loop after vectorization.
3539   //
3540   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3541   // end up with a slightly less accurate result, but that should be OK since
3542   // the profile is not inherently precise anyway.
3543   // vector code caused by legality checks is ignored, assigning all the weight
3544   // to the vector loop, optimistically.
3545   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3546                                LI->getLoopFor(LoopVectorBody),
3547                                LI->getLoopFor(LoopScalarBody), VF * UF);
3548 }
3549 
3550 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3551   // In order to support recurrences we need to be able to vectorize Phi nodes.
3552   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3553   // stage #2: We now need to fix the recurrences by adding incoming edges to
3554   // the currently empty PHI nodes. At this point every instruction in the
3555   // original loop is widened to a vector form so we can use them to construct
3556   // the incoming edges.
3557   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3558     // Handle first-order recurrences and reductions that need to be fixed.
3559     if (Legal->isFirstOrderRecurrence(&Phi))
3560       fixFirstOrderRecurrence(&Phi);
3561     else if (Legal->isReductionVariable(&Phi))
3562       fixReduction(&Phi);
3563   }
3564 }
3565 
3566 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3567   // This is the second phase of vectorizing first-order recurrences. An
3568   // overview of the transformation is described below. Suppose we have the
3569   // following loop.
3570   //
3571   //   for (int i = 0; i < n; ++i)
3572   //     b[i] = a[i] - a[i - 1];
3573   //
3574   // There is a first-order recurrence on "a". For this loop, the shorthand
3575   // scalar IR looks like:
3576   //
3577   //   scalar.ph:
3578   //     s_init = a[-1]
3579   //     br scalar.body
3580   //
3581   //   scalar.body:
3582   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3583   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3584   //     s2 = a[i]
3585   //     b[i] = s2 - s1
3586   //     br cond, scalar.body, ...
3587   //
3588   // In this example, s1 is a recurrence because its value depends on the
3589   // previous iteration. In the first phase of vectorization, we created a
3590   // temporary value for s1. We now complete the vectorization and produce the
3591   // shorthand vector IR shown below (for VF = 4, UF = 1).
3592   //
3593   //   vector.ph:
3594   //     v_init = vector(..., ..., ..., a[-1])
3595   //     br vector.body
3596   //
3597   //   vector.body
3598   //     i = phi [0, vector.ph], [i+4, vector.body]
3599   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3600   //     v2 = a[i, i+1, i+2, i+3];
3601   //     v3 = vector(v1(3), v2(0, 1, 2))
3602   //     b[i, i+1, i+2, i+3] = v2 - v3
3603   //     br cond, vector.body, middle.block
3604   //
3605   //   middle.block:
3606   //     x = v2(3)
3607   //     br scalar.ph
3608   //
3609   //   scalar.ph:
3610   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3611   //     br scalar.body
3612   //
3613   // After the vector loop completes execution, we extract the next value of
3614   // the recurrence (x) to use as the initial value in the scalar loop.
3615 
3616   // Get the original loop preheader and single loop latch.
3617   auto *Preheader = OrigLoop->getLoopPreheader();
3618   auto *Latch = OrigLoop->getLoopLatch();
3619 
3620   // Get the initial and previous values of the scalar recurrence.
3621   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3622   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3623 
3624   // Create a vector from the initial value.
3625   auto *VectorInit = ScalarInit;
3626   if (VF > 1) {
3627     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3628     VectorInit = Builder.CreateInsertElement(
3629         UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)),
3630         VectorInit, Builder.getInt32(VF - 1), "vector.recur.init");
3631   }
3632 
3633   // We constructed a temporary phi node in the first phase of vectorization.
3634   // This phi node will eventually be deleted.
3635   Builder.SetInsertPoint(
3636       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3637 
3638   // Create a phi node for the new recurrence. The current value will either be
3639   // the initial value inserted into a vector or loop-varying vector value.
3640   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3641   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3642 
3643   // Get the vectorized previous value of the last part UF - 1. It appears last
3644   // among all unrolled iterations, due to the order of their construction.
3645   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3646 
3647   // Find and set the insertion point after the previous value if it is an
3648   // instruction.
3649   BasicBlock::iterator InsertPt;
3650   // Note that the previous value may have been constant-folded so it is not
3651   // guaranteed to be an instruction in the vector loop.
3652   // FIXME: Loop invariant values do not form recurrences. We should deal with
3653   //        them earlier.
3654   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3655     InsertPt = LoopVectorBody->getFirstInsertionPt();
3656   else {
3657     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3658     if (isa<PHINode>(PreviousLastPart))
3659       // If the previous value is a phi node, we should insert after all the phi
3660       // nodes in the block containing the PHI to avoid breaking basic block
3661       // verification. Note that the basic block may be different to
3662       // LoopVectorBody, in case we predicate the loop.
3663       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3664     else
3665       InsertPt = ++PreviousInst->getIterator();
3666   }
3667   Builder.SetInsertPoint(&*InsertPt);
3668 
3669   // We will construct a vector for the recurrence by combining the values for
3670   // the current and previous iterations. This is the required shuffle mask.
3671   SmallVector<int, 8> ShuffleMask(VF);
3672   ShuffleMask[0] = VF - 1;
3673   for (unsigned I = 1; I < VF; ++I)
3674     ShuffleMask[I] = I + VF - 1;
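  // For illustration (assuming VF = 4): the mask is <3, 4, 5, 6>, selecting
  // the last lane of the first input vector followed by the first three lanes
  // of the second, matching v3 = vector(v1(3), v2(0, 1, 2)) in the sketch
  // above.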
3675 
3676   // The vector from which to take the initial value for the current iteration
3677   // (actual or unrolled). Initially, this is the vector phi node.
3678   Value *Incoming = VecPhi;
3679 
3680   // Shuffle the current and previous vector and update the vector parts.
3681   for (unsigned Part = 0; Part < UF; ++Part) {
3682     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3683     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3684     auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3685                                                          ShuffleMask)
3686                            : Incoming;
3687     PhiPart->replaceAllUsesWith(Shuffle);
3688     cast<Instruction>(PhiPart)->eraseFromParent();
3689     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3690     Incoming = PreviousPart;
3691   }
3692 
3693   // Fix the latch value of the new recurrence in the vector loop.
3694   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3695 
3696   // Extract the last vector element in the middle block. This will be the
3697   // initial value for the recurrence when jumping to the scalar loop.
3698   auto *ExtractForScalar = Incoming;
3699   if (VF > 1) {
3700     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3701     ExtractForScalar = Builder.CreateExtractElement(
3702         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3703   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value used when jumping from LoopMiddleBlock to the exit
  // block, i.e. when the scalar loop is not run at all.
3709   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3710   if (VF > 1)
3711     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3712         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting
  // the second-to-last element when VF > 1.
3717   else if (UF > 1)
3718     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
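  // For illustration (a sketch assuming VF = 4, UF = 1): ExtractForScalar is
  // lane 3 of the final recurrence vector (the next value of the recurrence),
  // while ExtractForPhiUsedOutsideLoop is lane 2, i.e. the value the scalar
  // phi 's1' held in the last iteration executed by the vector loop.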
3719 
3720   // Fix the initial value of the original recurrence in the scalar loop.
3721   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3722   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3723   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3724     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3725     Start->addIncoming(Incoming, BB);
3726   }
3727 
3728   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3729   Phi->setName("scalar.recur");
3730 
3731   // Finally, fix users of the recurrence outside the loop. The users will need
3732   // either the last value of the scalar recurrence or the last value of the
3733   // vector recurrence we extracted in the middle block. Since the loop is in
3734   // LCSSA form, we just need to find all the phi nodes for the original scalar
3735   // recurrence in the exit block, and then add an edge for the middle block.
3736   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3737     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3738       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3739     }
3740   }
3741 }
3742 
3743 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3744   Constant *Zero = Builder.getInt32(0);
3745 
  // Get its reduction variable descriptor.
3747   assert(Legal->isReductionVariable(Phi) &&
3748          "Unable to find the reduction variable");
3749   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3750 
3751   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3752   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3753   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3754   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3755     RdxDesc.getMinMaxRecurrenceKind();
3756   setDebugLocFromInst(Builder, ReductionStartValue);
3757 
3758   // We need to generate a reduction vector from the incoming scalar.
3759   // To do so, we need to generate the 'identity' vector and override
3760   // one of the elements with the incoming scalar reduction. We need
3761   // to do it in the vector-loop preheader.
3762   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3763 
3764   // This is the vector-clone of the value that leaves the loop.
3765   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3766 
  // Find the reduction identity value: zero for addition, or and xor;
  // one for multiplication; -1 (all bits set) for and.
3769   Value *Identity;
3770   Value *VectorStart;
3771   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3772       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3774     if (VF == 1) {
3775       VectorStart = Identity = ReductionStartValue;
3776     } else {
3777       VectorStart = Identity =
3778         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3779     }
3780   } else {
3781     // Handle other reduction kinds:
3782     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3783         RK, VecTy->getScalarType());
3784     if (VF == 1) {
3785       Identity = Iden;
      // When no vectorization is performed (VF == 1), the start value is
      // simply the incoming scalar reduction.
3788       VectorStart = ReductionStartValue;
3789     } else {
3790       Identity = ConstantVector::getSplat({VF, false}, Iden);
3791 
3792       // This vector is the Identity vector where the first element is the
3793       // incoming scalar reduction.
3794       VectorStart =
3795         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3796     }
3797   }
3798 
3799   // Wrap flags are in general invalid after vectorization, clear them.
3800   clearReductionWrapFlags(RdxDesc);
3801 
3802   // Fix the vector-loop phi.
3803 
3804   // Reductions do not have to start at zero. They can start with
3805   // any loop invariant values.
3806   BasicBlock *Latch = OrigLoop->getLoopLatch();
3807   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3808 
3809   for (unsigned Part = 0; Part < UF; ++Part) {
3810     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3811     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3812     // Make sure to add the reduction start value only to the
3813     // first unroll part.
3814     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3815     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3816     cast<PHINode>(VecRdxPhi)
3817       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3818   }
3819 
3820   // Before each round, move the insertion point right between
3821   // the PHIs and the values we are going to write.
3822   // This allows us to write both PHINodes and the extractelement
3823   // instructions.
3824   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3825 
3826   setDebugLocFromInst(Builder, LoopExitInst);
3827 
3828   // If tail is folded by masking, the vector value to leave the loop should be
3829   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3830   // instead of the former.
3831   if (Cost->foldTailByMasking()) {
3832     for (unsigned Part = 0; Part < UF; ++Part) {
3833       Value *VecLoopExitInst =
3834           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3835       Value *Sel = nullptr;
3836       for (User *U : VecLoopExitInst->users()) {
3837         if (isa<SelectInst>(U)) {
3838           assert(!Sel && "Reduction exit feeding two selects");
3839           Sel = U;
3840         } else
3841           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3842       }
3843       assert(Sel && "Reduction exit feeds no select");
3844       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3845     }
3846   }
3847 
3848   // If the vector reduction can be performed in a smaller type, we truncate
3849   // then extend the loop exit value to enable InstCombine to evaluate the
3850   // entire expression in the smaller type.
3851   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3852     Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF);
3853     Builder.SetInsertPoint(
3854         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3855     VectorParts RdxParts(UF);
3856     for (unsigned Part = 0; Part < UF; ++Part) {
3857       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3858       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3859       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3860                                         : Builder.CreateZExt(Trunc, VecTy);
3861       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3862            UI != RdxParts[Part]->user_end();)
3863         if (*UI != Trunc) {
3864           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3865           RdxParts[Part] = Extnd;
3866         } else {
3867           ++UI;
3868         }
3869     }
3870     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3871     for (unsigned Part = 0; Part < UF; ++Part) {
3872       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3873       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3874     }
3875   }
3876 
3877   // Reduce all of the unrolled parts into a single vector.
3878   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3879   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3880 
3881   // The middle block terminator has already been assigned a DebugLoc here (the
3882   // OrigLoop's single latch terminator). We want the whole middle block to
3883   // appear to execute on this line because: (a) it is all compiler generated,
3884   // (b) these instructions are always executed after evaluating the latch
3885   // conditional branch, and (c) other passes may add new predecessors which
3886   // terminate on this line. This is the easiest way to ensure we don't
3887   // accidentally cause an extra step back into the loop while debugging.
3888   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3889   for (unsigned Part = 1; Part < UF; ++Part) {
3890     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3891     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3892       // Floating point operations had to be 'fast' to enable the reduction.
3893       ReducedPartRdx = addFastMathFlag(
3894           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3895                               ReducedPartRdx, "bin.rdx"),
3896           RdxDesc.getFastMathFlags());
3897     else
3898       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3899                                       RdxPart);
3900   }
3901 
3902   if (VF > 1) {
3903     bool NoNaN = Legal->hasFunNoNaNAttr();
3904     ReducedPartRdx =
3905         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3906     // If the reduction can be performed in a smaller type, we need to extend
3907     // the reduction to the wider type before we branch to the original loop.
3908     if (Phi->getType() != RdxDesc.getRecurrenceType())
3909       ReducedPartRdx =
3910         RdxDesc.isSigned()
3911         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3912         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3913   }
3914 
3915   // Create a phi node that merges control-flow from the backedge-taken check
3916   // block and the middle block.
3917   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3918                                         LoopScalarPreHeader->getTerminator());
3919   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3920     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3921   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3922 
3923   // Now, we need to fix the users of the reduction variable
3924   // inside and outside of the scalar remainder loop.
3925   // We know that the loop is in LCSSA form. We need to update the
3926   // PHI nodes in the exit blocks.
3927   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    // All PHINodes need to have a single incoming edge, or two if
    // we have already fixed them.
3930     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3931 
3932     // We found a reduction value exit-PHI. Update it with the
3933     // incoming bypass edge.
3934     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3935       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3936   } // end of the LCSSA phi scan.
3937 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3940   int IncomingEdgeBlockIdx =
3941     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3942   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3943   // Pick the other block.
3944   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3945   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3946   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3947 }
3948 
3949 void InnerLoopVectorizer::clearReductionWrapFlags(
3950     RecurrenceDescriptor &RdxDesc) {
3951   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3952   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3953       RK != RecurrenceDescriptor::RK_IntegerMult)
3954     return;
3955 
3956   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3957   assert(LoopExitInstr && "null loop exit instruction");
3958   SmallVector<Instruction *, 8> Worklist;
3959   SmallPtrSet<Instruction *, 8> Visited;
3960   Worklist.push_back(LoopExitInstr);
3961   Visited.insert(LoopExitInstr);
3962 
3963   while (!Worklist.empty()) {
3964     Instruction *Cur = Worklist.pop_back_val();
3965     if (isa<OverflowingBinaryOperator>(Cur))
3966       for (unsigned Part = 0; Part < UF; ++Part) {
3967         Value *V = getOrCreateVectorValue(Cur, Part);
3968         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3969       }
3970 
3971     for (User *U : Cur->users()) {
3972       Instruction *UI = cast<Instruction>(U);
3973       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3974           Visited.insert(UI).second)
3975         Worklist.push_back(UI);
3976     }
3977   }
3978 }
3979 
3980 void InnerLoopVectorizer::fixLCSSAPHIs() {
3981   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3982     if (LCSSAPhi.getNumIncomingValues() == 1) {
3983       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3984       // Non-instruction incoming values will have only one value.
3985       unsigned LastLane = 0;
3986       if (isa<Instruction>(IncomingValue))
3987           LastLane = Cost->isUniformAfterVectorization(
3988                          cast<Instruction>(IncomingValue), VF)
3989                          ? 0
3990                          : VF - 1;
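      // For illustration (assuming VF = 4): a uniform incoming value is read
      // from lane 0, while a non-uniform one is read from lane 3 of the last
      // unroll part.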
3991       // Can be a loop invariant incoming value or the last scalar value to be
3992       // extracted from the vectorized loop.
3993       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3994       Value *lastIncomingValue =
3995           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3996       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3997     }
3998   }
3999 }
4000 
4001 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4002   // The basic block and loop containing the predicated instruction.
4003   auto *PredBB = PredInst->getParent();
4004   auto *VectorLoop = LI->getLoopFor(PredBB);
4005 
4006   // Initialize a worklist with the operands of the predicated instruction.
4007   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4008 
4009   // Holds instructions that we need to analyze again. An instruction may be
4010   // reanalyzed if we don't yet know if we can sink it or not.
4011   SmallVector<Instruction *, 8> InstsToReanalyze;
4012 
4013   // Returns true if a given use occurs in the predicated block. Phi nodes use
4014   // their operands in their corresponding predecessor blocks.
4015   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4016     auto *I = cast<Instruction>(U.getUser());
4017     BasicBlock *BB = I->getParent();
4018     if (auto *Phi = dyn_cast<PHINode>(I))
4019       BB = Phi->getIncomingBlock(
4020           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4021     return BB == PredBB;
4022   };
4023 
4024   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
4026   // operands are then added to the worklist. The algorithm ends after one pass
4027   // through the worklist doesn't sink a single instruction.
4028   bool Changed;
4029   do {
4030     // Add the instructions that need to be reanalyzed to the worklist, and
4031     // reset the changed indicator.
4032     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4033     InstsToReanalyze.clear();
4034     Changed = false;
4035 
4036     while (!Worklist.empty()) {
4037       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4038 
4039       // We can't sink an instruction if it is a phi node, is already in the
4040       // predicated block, is not in the loop, or may have side effects.
4041       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4042           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4043         continue;
4044 
4045       // It's legal to sink the instruction if all its uses occur in the
4046       // predicated block. Otherwise, there's nothing to do yet, and we may
4047       // need to reanalyze the instruction.
4048       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4049         InstsToReanalyze.push_back(I);
4050         continue;
4051       }
4052 
4053       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4055       I->moveBefore(&*PredBB->getFirstInsertionPt());
4056       Worklist.insert(I->op_begin(), I->op_end());
4057 
4058       // The sinking may have enabled other instructions to be sunk, so we will
4059       // need to iterate.
4060       Changed = true;
4061     }
4062   } while (Changed);
4063 }
4064 
4065 void InnerLoopVectorizer::fixNonInductionPHIs() {
4066   for (PHINode *OrigPhi : OrigPHIsToFix) {
4067     PHINode *NewPhi =
4068         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4069     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4070 
4071     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4072         predecessors(OrigPhi->getParent()));
4073     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4074         predecessors(NewPhi->getParent()));
4075     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4076            "Scalar and Vector BB should have the same number of predecessors");
4077 
4078     // The insertion point in Builder may be invalidated by the time we get
4079     // here. Force the Builder insertion point to something valid so that we do
4080     // not run into issues during insertion point restore in
4081     // getOrCreateVectorValue calls below.
4082     Builder.SetInsertPoint(NewPhi);
4083 
4084     // The predecessor order is preserved and we can rely on mapping between
4085     // scalar and vector block predecessors.
4086     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4087       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4088 
4089       // When looking up the new scalar/vector values to fix up, use incoming
      // values from the original phi.
4091       Value *ScIncV =
4092           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4093 
      // A scalar incoming value may need a broadcast.
4095       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4096       NewPhi->addIncoming(NewIncV, NewPredBB);
4097     }
4098   }
4099 }
4100 
4101 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4102                                    unsigned UF, unsigned VF,
4103                                    bool IsPtrLoopInvariant,
4104                                    SmallBitVector &IsIndexLoopInvariant,
4105                                    VPTransformState &State) {
4106   // Construct a vector GEP by widening the operands of the scalar GEP as
4107   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4108   // results in a vector of pointers when at least one operand of the GEP
4109   // is vector-typed. Thus, to keep the representation compact, we only use
4110   // vector-typed operands for loop-varying values.
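  //
  // For illustration (a sketch assuming VF = 4): given 'gep %base, %i' with a
  // loop-invariant %base and a loop-varying index %i, we emit
  // 'gep %base, <i0, i1, i2, i3>', which produces a vector of four pointers.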
4111 
4112   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4113     // If we are vectorizing, but the GEP has only loop-invariant operands,
4114     // the GEP we build (by only using vector-typed operands for
4115     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4116     // produce a vector of pointers, we need to either arbitrarily pick an
4117     // operand to broadcast, or broadcast a clone of the original GEP.
4118     // Here, we broadcast a clone of the original.
4119     //
4120     // TODO: If at some point we decide to scalarize instructions having
4121     //       loop-invariant operands, this special case will no longer be
4122     //       required. We would add the scalarization decision to
4123     //       collectLoopScalars() and teach getVectorValue() to broadcast
4124     //       the lane-zero scalar value.
4125     auto *Clone = Builder.Insert(GEP->clone());
4126     for (unsigned Part = 0; Part < UF; ++Part) {
4127       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4128       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4129       addMetadata(EntryPart, GEP);
4130     }
4131   } else {
4132     // If the GEP has at least one loop-varying operand, we are sure to
4133     // produce a vector of pointers. But if we are only unrolling, we want
4134     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4135     // produce with the code below will be scalar (if VF == 1) or vector
4136     // (otherwise). Note that for the unroll-only case, we still maintain
4137     // values in the vector mapping with initVector, as we do for other
4138     // instructions.
4139     for (unsigned Part = 0; Part < UF; ++Part) {
4140       // The pointer operand of the new GEP. If it's loop-invariant, we
4141       // won't broadcast it.
4142       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4143                                      : State.get(Operands.getOperand(0), Part);
4144 
4145       // Collect all the indices for the new GEP. If any index is
4146       // loop-invariant, we won't broadcast it.
4147       SmallVector<Value *, 4> Indices;
4148       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4149         VPValue *Operand = Operands.getOperand(I);
4150         if (IsIndexLoopInvariant[I - 1])
4151           Indices.push_back(State.get(Operand, {0, 0}));
4152         else
4153           Indices.push_back(State.get(Operand, Part));
4154       }
4155 
4156       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4157       // but it should be a vector, otherwise.
4158       auto *NewGEP =
4159           GEP->isInBounds()
4160               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4161                                           Indices)
4162               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4163       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4164              "NewGEP is not a pointer vector");
4165       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4166       addMetadata(NewGEP, GEP);
4167     }
4168   }
4169 }
4170 
4171 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4172                                               unsigned VF) {
4173   PHINode *P = cast<PHINode>(PN);
4174   if (EnableVPlanNativePath) {
4175     // Currently we enter here in the VPlan-native path for non-induction
4176     // PHIs where all control flow is uniform. We simply widen these PHIs.
4177     // Create a vector phi with no operands - the vector phi operands will be
4178     // set at the end of vector code generation.
4179     Type *VecTy =
4180         (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
4181     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4182     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4183     OrigPHIsToFix.push_back(P);
4184 
4185     return;
4186   }
4187 
4188   assert(PN->getParent() == OrigLoop->getHeader() &&
4189          "Non-header phis should have been handled elsewhere");
4190 
4191   // In order to support recurrences we need to be able to vectorize Phi nodes.
4192   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4193   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4194   // this value when we vectorize all of the instructions that use the PHI.
4195   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4196     for (unsigned Part = 0; Part < UF; ++Part) {
4197       // This is phase one of vectorizing PHIs.
4198       Type *VecTy =
4199           (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
4200       Value *EntryPart = PHINode::Create(
4201           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4202       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4203     }
4204     return;
4205   }
4206 
4207   setDebugLocFromInst(Builder, P);
4208 
4209   // This PHINode must be an induction variable.
4210   // Make sure that we know about it.
4211   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4212 
4213   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4214   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4215 
4216   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4217   // which can be found from the original scalar operations.
4218   switch (II.getKind()) {
4219   case InductionDescriptor::IK_NoInduction:
4220     llvm_unreachable("Unknown induction");
4221   case InductionDescriptor::IK_IntInduction:
4222   case InductionDescriptor::IK_FpInduction:
4223     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4224   case InductionDescriptor::IK_PtrInduction: {
4225     // Handle the pointer induction variable case.
4226     assert(P->getType()->isPointerTy() && "Unexpected type.");
4227     // This is the normalized GEP that starts counting at zero.
4228     Value *PtrInd = Induction;
4229     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4230     // Determine the number of scalars we need to generate for each unroll
4231     // iteration. If the instruction is uniform, we only need to generate the
4232     // first lane. Otherwise, we generate all VF values.
4233     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4234     // These are the scalar results. Notice that we don't generate vector GEPs
4235     // because scalar GEPs result in better code.
4236     for (unsigned Part = 0; Part < UF; ++Part) {
4237       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4238         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4239         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4240         Value *SclrGep =
4241             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4242         SclrGep->setName("next.gep");
4243         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4244       }
4245     }
4246     return;
4247   }
4248   }
4249 }
4250 
4251 /// A helper function for checking whether an integer division-related
4252 /// instruction may divide by zero (in which case it must be predicated if
4253 /// executed conditionally in the scalar code).
4254 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4256 /// converted into multiplication, so we will still end up scalarizing
4257 /// the division, but can do so w/o predication.
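/// For example, 'udiv %a, 2' can never divide by zero, whereas 'udiv %a, %b'
/// with a non-constant %b is conservatively assumed to possibly divide by
/// zero.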
4258 static bool mayDivideByZero(Instruction &I) {
4259   assert((I.getOpcode() == Instruction::UDiv ||
4260           I.getOpcode() == Instruction::SDiv ||
4261           I.getOpcode() == Instruction::URem ||
4262           I.getOpcode() == Instruction::SRem) &&
4263          "Unexpected instruction");
4264   Value *Divisor = I.getOperand(1);
4265   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4266   return !CInt || CInt->isZero();
4267 }
4268 
4269 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4270                                            VPTransformState &State) {
4271   switch (I.getOpcode()) {
4272   case Instruction::Call:
4273   case Instruction::Br:
4274   case Instruction::PHI:
4275   case Instruction::GetElementPtr:
4276   case Instruction::Select:
4277     llvm_unreachable("This instruction is handled by a different recipe.");
4278   case Instruction::UDiv:
4279   case Instruction::SDiv:
4280   case Instruction::SRem:
4281   case Instruction::URem:
4282   case Instruction::Add:
4283   case Instruction::FAdd:
4284   case Instruction::Sub:
4285   case Instruction::FSub:
4286   case Instruction::FNeg:
4287   case Instruction::Mul:
4288   case Instruction::FMul:
4289   case Instruction::FDiv:
4290   case Instruction::FRem:
4291   case Instruction::Shl:
4292   case Instruction::LShr:
4293   case Instruction::AShr:
4294   case Instruction::And:
4295   case Instruction::Or:
4296   case Instruction::Xor: {
4297     // Just widen unops and binops.
4298     setDebugLocFromInst(Builder, &I);
4299 
4300     for (unsigned Part = 0; Part < UF; ++Part) {
4301       SmallVector<Value *, 2> Ops;
4302       for (VPValue *VPOp : User.operands())
4303         Ops.push_back(State.get(VPOp, Part));
4304 
4305       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4306 
4307       if (auto *VecOp = dyn_cast<Instruction>(V))
4308         VecOp->copyIRFlags(&I);
4309 
4310       // Use this vector value for all users of the original instruction.
4311       VectorLoopValueMap.setVectorValue(&I, Part, V);
4312       addMetadata(V, &I);
4313     }
4314 
4315     break;
4316   }
4317   case Instruction::ICmp:
4318   case Instruction::FCmp: {
4319     // Widen compares. Generate vector compares.
4320     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4321     auto *Cmp = cast<CmpInst>(&I);
4322     setDebugLocFromInst(Builder, Cmp);
4323     for (unsigned Part = 0; Part < UF; ++Part) {
4324       Value *A = State.get(User.getOperand(0), Part);
4325       Value *B = State.get(User.getOperand(1), Part);
4326       Value *C = nullptr;
4327       if (FCmp) {
4328         // Propagate fast math flags.
4329         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4330         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4331         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4332       } else {
4333         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4334       }
4335       VectorLoopValueMap.setVectorValue(&I, Part, C);
4336       addMetadata(C, &I);
4337     }
4338 
4339     break;
4340   }
4341 
4342   case Instruction::ZExt:
4343   case Instruction::SExt:
4344   case Instruction::FPToUI:
4345   case Instruction::FPToSI:
4346   case Instruction::FPExt:
4347   case Instruction::PtrToInt:
4348   case Instruction::IntToPtr:
4349   case Instruction::SIToFP:
4350   case Instruction::UIToFP:
4351   case Instruction::Trunc:
4352   case Instruction::FPTrunc:
4353   case Instruction::BitCast: {
4354     auto *CI = cast<CastInst>(&I);
4355     setDebugLocFromInst(Builder, CI);
4356 
    // Vectorize casts.
4358     Type *DestTy =
4359         (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF);
4360 
4361     for (unsigned Part = 0; Part < UF; ++Part) {
4362       Value *A = State.get(User.getOperand(0), Part);
4363       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4364       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4365       addMetadata(Cast, &I);
4366     }
4367     break;
4368   }
4369   default:
4370     // This instruction is not vectorized by simple widening.
4371     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4372     llvm_unreachable("Unhandled instruction!");
4373   } // end of switch.
4374 }
4375 
4376 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4377                                                VPTransformState &State) {
4378   assert(!isa<DbgInfoIntrinsic>(I) &&
4379          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4380   setDebugLocFromInst(Builder, &I);
4381 
4382   Module *M = I.getParent()->getParent()->getParent();
4383   auto *CI = cast<CallInst>(&I);
4384 
4385   SmallVector<Type *, 4> Tys;
4386   for (Value *ArgOperand : CI->arg_operands())
4387     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4388 
4389   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4390 
  // This flag indicates whether we use an intrinsic or a regular call for the
  // vectorized version of the instruction, i.e. whether the intrinsic call is
  // more beneficial than a library call.
4394   bool NeedToScalarize = false;
4395   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4396   bool UseVectorIntrinsic =
4397       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4398   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4399          "Instruction should be scalarized elsewhere.");
4400 
4401   for (unsigned Part = 0; Part < UF; ++Part) {
4402     SmallVector<Value *, 4> Args;
4403     for (auto &I : enumerate(ArgOperands.operands())) {
4404       // Some intrinsics have a scalar argument - don't replace it with a
4405       // vector.
4406       Value *Arg;
4407       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4408         Arg = State.get(I.value(), Part);
4409       else
4410         Arg = State.get(I.value(), {0, 0});
4411       Args.push_back(Arg);
4412     }
4413 
4414     Function *VectorF;
4415     if (UseVectorIntrinsic) {
4416       // Use vector version of the intrinsic.
4417       Type *TysForDecl[] = {CI->getType()};
4418       if (VF > 1)
4419         TysForDecl[0] =
4420             FixedVectorType::get(CI->getType()->getScalarType(), VF);
4421       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4422       assert(VectorF && "Can't retrieve vector intrinsic.");
4423     } else {
4424       // Use vector version of the function call.
4425       const VFShape Shape =
4426           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4427 #ifndef NDEBUG
4428       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4429              "Can't create vector function.");
4430 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
4442   }
4443 }
4444 
4445 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4446                                                  VPUser &Operands,
4447                                                  bool InvariantCond,
4448                                                  VPTransformState &State) {
4449   setDebugLocFromInst(Builder, &I);
4450 
  // The condition can be loop invariant but still defined inside the
4452   // loop. This means that we can't just use the original 'cond' value.
4453   // We have to take the 'vectorized' value and pick the first lane.
4454   // Instcombine will make this a no-op.
4455   auto *InvarCond =
4456       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4457 
4458   for (unsigned Part = 0; Part < UF; ++Part) {
4459     Value *Cond =
4460         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4461     Value *Op0 = State.get(Operands.getOperand(1), Part);
4462     Value *Op1 = State.get(Operands.getOperand(2), Part);
4463     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4464     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4465     addMetadata(Sel, &I);
4466   }
4467 }
4468 
4469 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4470   // We should not collect Scalars more than once per VF. Right now, this
4471   // function is called from collectUniformsAndScalars(), which already does
4472   // this check. Collecting Scalars for VF=1 does not make any sense.
4473   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4474          "This function should not be visited twice for the same VF");
4475 
4476   SmallSetVector<Instruction *, 8> Worklist;
4477 
4478   // These sets are used to seed the analysis with pointers used by memory
4479   // accesses that will remain scalar.
4480   SmallSetVector<Instruction *, 8> ScalarPtrs;
4481   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4482 
4483   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4484   // The pointer operands of loads and stores will be scalar as long as the
4485   // memory access is not a gather or scatter operation. The value operand of a
4486   // store will remain scalar if the store is scalarized.
4487   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4488     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4489     assert(WideningDecision != CM_Unknown &&
4490            "Widening decision should be ready at this moment");
4491     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4492       if (Ptr == Store->getValueOperand())
4493         return WideningDecision == CM_Scalarize;
4494     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4495            "Ptr is neither a value or pointer operand");
4496     return WideningDecision != CM_GatherScatter;
4497   };
4498 
4499   // A helper that returns true if the given value is a bitcast or
4500   // getelementptr instruction contained in the loop.
4501   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4502     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4503             isa<GetElementPtrInst>(V)) &&
4504            !TheLoop->isLoopInvariant(V);
4505   };
4506 
4507   // A helper that evaluates a memory access's use of a pointer. If the use
4508   // will be a scalar use, and the pointer is only used by memory accesses, we
4509   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4510   // PossibleNonScalarPtrs.
4511   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4512     // We only care about bitcast and getelementptr instructions contained in
4513     // the loop.
4514     if (!isLoopVaryingBitCastOrGEP(Ptr))
4515       return;
4516 
4517     // If the pointer has already been identified as scalar (e.g., if it was
4518     // also identified as uniform), there's nothing to do.
4519     auto *I = cast<Instruction>(Ptr);
4520     if (Worklist.count(I))
4521       return;
4522 
4523     // If the use of the pointer will be a scalar use, and all users of the
4524     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4525     // place the pointer in PossibleNonScalarPtrs.
4526     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4527           return isa<LoadInst>(U) || isa<StoreInst>(U);
4528         }))
4529       ScalarPtrs.insert(I);
4530     else
4531       PossibleNonScalarPtrs.insert(I);
4532   };
4533 
4534   // We seed the scalars analysis with three classes of instructions: (1)
4535   // instructions marked uniform-after-vectorization, (2) bitcast and
4536   // getelementptr instructions used by memory accesses requiring a scalar use,
4537   // and (3) pointer induction variables and their update instructions (we
4538   // currently only scalarize these).
4539   //
4540   // (1) Add to the worklist all instructions that have been identified as
4541   // uniform-after-vectorization.
4542   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4543 
4544   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4545   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4547   // scatter operation. The value operand of a store will remain scalar if the
4548   // store is scalarized.
4549   for (auto *BB : TheLoop->blocks())
4550     for (auto &I : *BB) {
4551       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4552         evaluatePtrUse(Load, Load->getPointerOperand());
4553       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4554         evaluatePtrUse(Store, Store->getPointerOperand());
4555         evaluatePtrUse(Store, Store->getValueOperand());
4556       }
4557     }
4558   for (auto *I : ScalarPtrs)
4559     if (!PossibleNonScalarPtrs.count(I)) {
4560       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4561       Worklist.insert(I);
4562     }
4563 
4564   // (3) Add to the worklist all pointer induction variables and their update
4565   // instructions.
4566   //
4567   // TODO: Once we are able to vectorize pointer induction variables we should
4568   //       no longer insert them into the worklist here.
4569   auto *Latch = TheLoop->getLoopLatch();
4570   for (auto &Induction : Legal->getInductionVars()) {
4571     auto *Ind = Induction.first;
4572     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4573     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4574       continue;
4575     Worklist.insert(Ind);
4576     Worklist.insert(IndUpdate);
4577     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4578     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4579                       << "\n");
4580   }
4581 
4582   // Insert the forced scalars.
4583   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4584   // induction variable when the PHI user is scalarized.
4585   auto ForcedScalar = ForcedScalars.find(VF);
4586   if (ForcedScalar != ForcedScalars.end())
4587     for (auto *I : ForcedScalar->second)
4588       Worklist.insert(I);
4589 
4590   // Expand the worklist by looking through any bitcasts and getelementptr
4591   // instructions we've already identified as scalar. This is similar to the
4592   // expansion step in collectLoopUniforms(); however, here we're only
4593   // expanding to include additional bitcasts and getelementptr instructions.
4594   unsigned Idx = 0;
4595   while (Idx != Worklist.size()) {
4596     Instruction *Dst = Worklist[Idx++];
4597     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4598       continue;
4599     auto *Src = cast<Instruction>(Dst->getOperand(0));
4600     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4601           auto *J = cast<Instruction>(U);
4602           return !TheLoop->contains(J) || Worklist.count(J) ||
4603                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4604                   isScalarUse(J, Src));
4605         })) {
4606       Worklist.insert(Src);
4607       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4608     }
4609   }
4610 
4611   // An induction variable will remain scalar if all users of the induction
4612   // variable and induction variable update remain scalar.
4613   for (auto &Induction : Legal->getInductionVars()) {
4614     auto *Ind = Induction.first;
4615     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4616 
4617     // We already considered pointer induction variables, so there's no reason
4618     // to look at their users again.
4619     //
4620     // TODO: Once we are able to vectorize pointer induction variables we
4621     //       should no longer skip over them here.
4622     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4623       continue;
4624 
4625     // If tail-folding is applied, the primary induction variable will be used
4626     // to feed a vector compare.
4627     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4628       continue;
4629 
4630     // Determine if all users of the induction variable are scalar after
4631     // vectorization.
4632     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4633       auto *I = cast<Instruction>(U);
4634       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4635     });
4636     if (!ScalarInd)
4637       continue;
4638 
4639     // Determine if all users of the induction variable update instruction are
4640     // scalar after vectorization.
4641     auto ScalarIndUpdate =
4642         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4643           auto *I = cast<Instruction>(U);
4644           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4645         });
4646     if (!ScalarIndUpdate)
4647       continue;
4648 
4649     // The induction variable and its update instruction will remain scalar.
4650     Worklist.insert(Ind);
4651     Worklist.insert(IndUpdate);
4652     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4653     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4654                       << "\n");
4655   }
4656 
4657   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4658 }
4659 
4660 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4661   if (!blockNeedsPredication(I->getParent()))
4662     return false;
4663   switch(I->getOpcode()) {
4664   default:
4665     break;
4666   case Instruction::Load:
4667   case Instruction::Store: {
4668     if (!Legal->isMaskRequired(I))
4669       return false;
4670     auto *Ptr = getLoadStorePointerOperand(I);
4671     auto *Ty = getMemInstValueType(I);
4672     // We have already decided how to vectorize this instruction, get that
4673     // result.
4674     if (VF > 1) {
4675       InstWidening WideningDecision = getWideningDecision(I, VF);
4676       assert(WideningDecision != CM_Unknown &&
4677              "Widening decision should be ready at this moment");
4678       return WideningDecision == CM_Scalarize;
4679     }
4680     const Align Alignment = getLoadStoreAlignment(I);
4681     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4682                                 isLegalMaskedGather(Ty, Alignment))
4683                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4684                                 isLegalMaskedScatter(Ty, Alignment));
4685   }
4686   case Instruction::UDiv:
4687   case Instruction::SDiv:
4688   case Instruction::SRem:
4689   case Instruction::URem:
4690     return mayDivideByZero(*I);
4691   }
4692   return false;
4693 }
4694 
4695 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4696                                                                unsigned VF) {
4697   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4698   assert(getWideningDecision(I, VF) == CM_Unknown &&
4699          "Decision should not be set yet.");
4700   auto *Group = getInterleavedAccessGroup(I);
4701   assert(Group && "Must have a group.");
4702 
  // If the instruction's allocated size doesn't equal its type size, it
4704   // requires padding and will be scalarized.
4705   auto &DL = I->getModule()->getDataLayout();
4706   auto *ScalarTy = getMemInstValueType(I);
4707   if (hasIrregularType(ScalarTy, DL, VF))
4708     return false;
4709 
4710   // Check if masking is required.
4711   // A Group may need masking for one of two reasons: it resides in a block that
4712   // needs predication, or it was decided to use masking to deal with gaps.
4713   bool PredicatedAccessRequiresMasking =
4714       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4715   bool AccessWithGapsRequiresMasking =
4716       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4717   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4718     return true;
4719 
4720   // If masked interleaving is required, we expect that the user/target had
4721   // enabled it, because otherwise it either wouldn't have been created or
4722   // it should have been invalidated by the CostModel.
4723   assert(useMaskedInterleavedAccesses(TTI) &&
4724          "Masked interleave-groups for predicated accesses are not enabled.");
4725 
4726   auto *Ty = getMemInstValueType(I);
4727   const Align Alignment = getLoadStoreAlignment(I);
4728   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4729                           : TTI.isLegalMaskedStore(Ty, Alignment);
4730 }
4731 
4732 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4733                                                                unsigned VF) {
4734   // Get and ensure we have a valid memory instruction.
4735   LoadInst *LI = dyn_cast<LoadInst>(I);
4736   StoreInst *SI = dyn_cast<StoreInst>(I);
4737   assert((LI || SI) && "Invalid memory instruction");
4738 
4739   auto *Ptr = getLoadStorePointerOperand(I);
4740 
4741   // In order to be widened, the pointer should be consecutive, first of all.
4742   if (!Legal->isConsecutivePtr(Ptr))
4743     return false;
4744 
4745   // If the instruction is a store located in a predicated block, it will be
4746   // scalarized.
4747   if (isScalarWithPredication(I))
4748     return false;
4749 
  // If the instruction's allocated size doesn't equal its type size, it
4751   // requires padding and will be scalarized.
4752   auto &DL = I->getModule()->getDataLayout();
4753   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4754   if (hasIrregularType(ScalarTy, DL, VF))
4755     return false;
4756 
4757   return true;
4758 }
4759 
4760 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4761   // We should not collect Uniforms more than once per VF. Right now,
4762   // this function is called from collectUniformsAndScalars(), which
4763   // already does this check. Collecting Uniforms for VF=1 does not make any
4764   // sense.
4765 
4766   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4767          "This function should not be visited twice for the same VF");
4768 
  // Visit the list of Uniforms. Even if we do not find any uniform value, we
  // will not analyze this VF again: Uniforms.count(VF) will return 1.
4771   Uniforms[VF].clear();
4772 
4773   // We now know that the loop is vectorizable!
4774   // Collect instructions inside the loop that will remain uniform after
4775   // vectorization.
4776 
4777   // Global values, params and instructions outside of current loop are out of
4778   // scope.
4779   auto isOutOfScope = [&](Value *V) -> bool {
4780     Instruction *I = dyn_cast<Instruction>(V);
4781     return (!I || !TheLoop->contains(I));
4782   };
4783 
4784   SetVector<Instruction *> Worklist;
4785   BasicBlock *Latch = TheLoop->getLoopLatch();
4786 
4787   // Instructions that are scalar with predication must not be considered
4788   // uniform after vectorization, because that would create an erroneous
4789   // replicating region where only a single instance out of VF should be formed.
4790   // TODO: optimize such seldom cases if found important, see PR40816.
4791   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4792     if (isScalarWithPredication(I, VF)) {
4793       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4794                         << *I << "\n");
4795       return;
4796     }
4797     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4798     Worklist.insert(I);
4799   };
4800 
4801   // Start with the conditional branch. If the branch condition is an
4802   // instruction contained in the loop that is only used by the branch, it is
4803   // uniform.
4804   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4805   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4806     addToWorklistIfAllowed(Cmp);
4807 
4808   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4809   // are pointers that are treated like consecutive pointers during
4810   // vectorization. The pointer operands of interleaved accesses are an
4811   // example.
4812   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4813 
4814   // Holds pointer operands of instructions that are possibly non-uniform.
4815   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4816 
4817   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4818     InstWidening WideningDecision = getWideningDecision(I, VF);
4819     assert(WideningDecision != CM_Unknown &&
4820            "Widening decision should be ready at this moment");
4821 
4822     return (WideningDecision == CM_Widen ||
4823             WideningDecision == CM_Widen_Reverse ||
4824             WideningDecision == CM_Interleave);
4825   };
4826   // Iterate over the instructions in the loop, and collect all
4827   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4828   // that a consecutive-like pointer operand will be scalarized, we collect it
4829   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4830   // getelementptr instruction can be used by both vectorized and scalarized
4831   // memory instructions. For example, if a loop loads and stores from the same
4832   // location, but the store is conditional, the store will be scalarized, and
4833   // the getelementptr won't remain uniform.
4834   for (auto *BB : TheLoop->blocks())
4835     for (auto &I : *BB) {
4836       // If there's no pointer operand, there's nothing to do.
4837       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4838       if (!Ptr)
4839         continue;
4840 
4841       // True if all users of Ptr are memory accesses that have Ptr as their
4842       // pointer operand.
4843       auto UsersAreMemAccesses =
4844           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4845             return getLoadStorePointerOperand(U) == Ptr;
4846           });
4847 
4848       // Ensure the memory instruction will not be scalarized or used by
4849       // gather/scatter, making its pointer operand non-uniform. If the pointer
4850       // operand is used by any instruction other than a memory access, we
4851       // conservatively assume the pointer operand may be non-uniform.
4852       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4853         PossibleNonUniformPtrs.insert(Ptr);
4854 
      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like or part of an interleaved group, the pointer
      // operand should remain uniform.
4858       else
4859         ConsecutiveLikePtrs.insert(Ptr);
4860     }
4861 
4862   // Add to the Worklist all consecutive and consecutive-like pointers that
4863   // aren't also identified as possibly non-uniform.
4864   for (auto *V : ConsecutiveLikePtrs)
4865     if (!PossibleNonUniformPtrs.count(V))
4866       addToWorklistIfAllowed(V);
4867 
  // Expand Worklist in topological order: whenever a new instruction is added,
  // its users are already inside the Worklist. This ensures that a uniform
  // instruction is only used by other uniform instructions.
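  // For example, once a uniform load is in the Worklist, its GEP address
  // operand is added as well, provided every user of that GEP is either
  // already in the Worklist or is a memory access that keeps the GEP as a
  // widened (uniform) address.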
4871   unsigned idx = 0;
4872   while (idx != Worklist.size()) {
4873     Instruction *I = Worklist[idx++];
4874 
4875     for (auto OV : I->operand_values()) {
4876       // isOutOfScope operands cannot be uniform instructions.
4877       if (isOutOfScope(OV))
4878         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4881       auto *OP = dyn_cast<PHINode>(OV);
4882       if (OP && Legal->isFirstOrderRecurrence(OP))
4883         continue;
4884       // If all the users of the operand are uniform, then add the
4885       // operand into the uniform worklist.
4886       auto *OI = cast<Instruction>(OV);
4887       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4888             auto *J = cast<Instruction>(U);
4889             return Worklist.count(J) ||
4890                    (OI == getLoadStorePointerOperand(J) &&
4891                     isUniformDecision(J, VF));
4892           }))
4893         addToWorklistIfAllowed(OI);
4894     }
4895   }
4896 
4897   // Returns true if Ptr is the pointer operand of a memory access instruction
4898   // I, and I is known to not require scalarization.
4899   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4900     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4901   };
4902 
4903   // For an instruction to be added into Worklist above, all its users inside
4904   // the loop should also be in Worklist. However, this condition cannot be
4905   // true for phi nodes that form a cyclic dependence. We must process phi
4906   // nodes separately. An induction variable will remain uniform if all users
4907   // of the induction variable and induction variable update remain uniform.
4908   // The code below handles both pointer and non-pointer induction variables.
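  // For example, a pointer induction that is used only as the address of
  // consecutive widened loads/stores (and by its own update instruction)
  // remains uniform, since none of its users requires a per-lane value.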
4909   for (auto &Induction : Legal->getInductionVars()) {
4910     auto *Ind = Induction.first;
4911     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4912 
4913     // Determine if all users of the induction variable are uniform after
4914     // vectorization.
4915     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4916       auto *I = cast<Instruction>(U);
4917       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4918              isVectorizedMemAccessUse(I, Ind);
4919     });
4920     if (!UniformInd)
4921       continue;
4922 
4923     // Determine if all users of the induction variable update instruction are
4924     // uniform after vectorization.
4925     auto UniformIndUpdate =
4926         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4927           auto *I = cast<Instruction>(U);
4928           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4929                  isVectorizedMemAccessUse(I, IndUpdate);
4930         });
4931     if (!UniformIndUpdate)
4932       continue;
4933 
4934     // The induction variable and its update instruction will remain uniform.
4935     addToWorklistIfAllowed(Ind);
4936     addToWorklistIfAllowed(IndUpdate);
4937   }
4938 
4939   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4940 }
4941 
4942 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4943   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4944 
4945   if (Legal->getRuntimePointerChecking()->Need) {
4946     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4947         "runtime pointer checks needed. Enable vectorization of this "
4948         "loop with '#pragma clang loop vectorize(enable)' when "
4949         "compiling with -Os/-Oz",
4950         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4951     return true;
4952   }
4953 
4954   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4955     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4956         "runtime SCEV checks needed. Enable vectorization of this "
4957         "loop with '#pragma clang loop vectorize(enable)' when "
4958         "compiling with -Os/-Oz",
4959         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4960     return true;
4961   }
4962 
4963   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4964   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4965     reportVectorizationFailure("Runtime stride check for small trip count",
4966         "runtime stride == 1 checks needed. Enable vectorization of "
4967         "this loop without such check by compiling with -Os/-Oz",
4968         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4969     return true;
4970   }
4971 
4972   return false;
4973 }
4974 
4975 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
4976                                                             unsigned UserIC) {
4977   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to be
    // dynamically uniform if the target can skip it.
4980     reportVectorizationFailure(
4981         "Not inserting runtime ptr check for divergent target",
4982         "runtime pointer checks needed. Not enabled for divergent target",
4983         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4984     return None;
4985   }
4986 
4987   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4988   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4989   if (TC == 1) {
4990     reportVectorizationFailure("Single iteration (non) loop",
4991         "loop trip count is one, irrelevant for vectorization",
4992         "SingleIterationLoop", ORE, TheLoop);
4993     return None;
4994   }
4995 
4996   switch (ScalarEpilogueStatus) {
4997   case CM_ScalarEpilogueAllowed:
4998     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
4999   case CM_ScalarEpilogueNotNeededUsePredicate:
5000     LLVM_DEBUG(
5001         dbgs() << "LV: vector predicate hint/switch found.\n"
5002                << "LV: Not allowing scalar epilogue, creating predicated "
5003                << "vector loop.\n");
5004     break;
5005   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5006     // fallthrough as a special case of OptForSize
5007   case CM_ScalarEpilogueNotAllowedOptSize:
5008     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5009       LLVM_DEBUG(
5010           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5011     else
5012       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5013                         << "count.\n");
5014 
5015     // Bail if runtime checks are required, which are not good when optimising
5016     // for size.
5017     if (runtimeChecksRequired())
5018       return None;
5019     break;
5020   }
5021 
  // Now try tail folding.
5023 
5024   // Invalidate interleave groups that require an epilogue if we can't mask
5025   // the interleave-group.
5026   if (!useMaskedInterleavedAccesses(TTI)) {
5027     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5028            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5031     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5032   }
5033 
5034   unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5035   assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5036   unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
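  // For illustration: with TC = 64, MaxVF = 8 and a user interleave count of 2,
  // MaxVFtimesIC = 16 and 64 % 16 == 0, so no tail remains and MaxVF is
  // accepted.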
5037   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5038     // Accept MaxVF if we do not have a tail.
5039     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5040     return MaxVF;
5041   }
5042 
5043   // If we don't know the precise trip count, or if the trip count that we
5044   // found modulo the vectorization factor is not zero, try to fold the tail
5045   // by masking.
5046   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5047   if (Legal->prepareToFoldTailByMasking()) {
5048     FoldTailByMasking = true;
5049     return MaxVF;
5050   }
5051 
5052   if (TC == 0) {
5053     reportVectorizationFailure(
5054         "Unable to calculate the loop count due to complex control flow",
5055         "unable to calculate the loop count due to complex control flow",
5056         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5057     return None;
5058   }
5059 
5060   reportVectorizationFailure(
5061       "Cannot optimize for size and vectorize at the same time.",
5062       "cannot optimize for size and vectorize at the same time. "
5063       "Enable vectorization of this loop with '#pragma clang loop "
5064       "vectorize(enable)' when compiling with -Os/-Oz",
5065       "NoTailLoopWithOptForSize", ORE, TheLoop);
5066   return None;
5067 }
5068 
5069 unsigned
5070 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5071   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5072   unsigned SmallestType, WidestType;
5073   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5074   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5075 
5076   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where the type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
5080   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5081 
5082   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5083 
  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
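  // For illustration: a 256-bit widest register and a 32-bit widest type give
  // MaxVectorSize = PowerOf2Floor(256 / 32) = 8.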
5086   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5087 
5088   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5089                     << " / " << WidestType << " bits.\n");
5090   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5091                     << WidestRegister << " bits.\n");
5092 
5093   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5094                                  " into one vector!");
5095   if (MaxVectorSize == 0) {
5096     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5097     MaxVectorSize = 1;
5098     return MaxVectorSize;
5099   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5100              isPowerOf2_32(ConstTripCount)) {
5101     // We need to clamp the VF to be the ConstTripCount. There is no point in
5102     // choosing a higher viable VF as done in the loop below.
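    // For example, a constant trip count of 4 with MaxVectorSize = 16 clamps
    // the VF to 4.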
5103     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5104                       << ConstTripCount << "\n");
5105     MaxVectorSize = ConstTripCount;
5106     return MaxVectorSize;
5107   }
5108 
5109   unsigned MaxVF = MaxVectorSize;
5110   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5111       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5112     // Collect all viable vectorization factors larger than the default MaxVF
5113     // (i.e. MaxVectorSize).
5114     SmallVector<unsigned, 8> VFs;
5115     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5116     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5117       VFs.push_back(VS);
5118 
5119     // For each VF calculate its register usage.
5120     auto RUs = calculateRegisterUsage(VFs);
5121 
5122     // Select the largest VF which doesn't require more registers than existing
5123     // ones.
5124     for (int i = RUs.size() - 1; i >= 0; --i) {
5125       bool Selected = true;
5126       for (auto& pair : RUs[i].MaxLocalUsers) {
5127         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5128         if (pair.second > TargetNumRegisters)
5129           Selected = false;
5130       }
5131       if (Selected) {
5132         MaxVF = VFs[i];
5133         break;
5134       }
5135     }
5136     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5137       if (MaxVF < MinVF) {
5138         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5139                           << ") with target's minimum: " << MinVF << '\n');
5140         MaxVF = MinVF;
5141       }
5142     }
5143   }
5144   return MaxVF;
5145 }
5146 
5147 VectorizationFactor
5148 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5149   float Cost = expectedCost(1).first;
5150   const float ScalarCost = Cost;
5151   unsigned Width = 1;
5152   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5153 
5154   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5155   if (ForceVectorization && MaxVF > 1) {
5156     // Ignore scalar width, because the user explicitly wants vectorization.
5157     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5158     // evaluation.
5159     Cost = std::numeric_limits<float>::max();
5160   }
5161 
5162   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so we
    // divide the cost of the vector loop by the vector width to obtain a
    // per-scalar-iteration cost that is comparable to the scalar loop cost.
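    // For illustration: a vector loop cost of 20 at VF = 4 is compared as a
    // per-lane cost of 5 against the scalar loop cost.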
5166     VectorizationCostTy C = expectedCost(i);
5167     float VectorCost = C.first / (float)i;
5168     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5169                       << " costs: " << (int)VectorCost << ".\n");
5170     if (!C.second && !ForceVectorization) {
5171       LLVM_DEBUG(
5172           dbgs() << "LV: Not considering vector loop of width " << i
5173                  << " because it will not generate any vector instructions.\n");
5174       continue;
5175     }
5176     if (VectorCost < Cost) {
5177       Cost = VectorCost;
5178       Width = i;
5179     }
5180   }
5181 
5182   if (!EnableCondStoresVectorization && NumPredStores) {
5183     reportVectorizationFailure("There are conditional stores.",
5184         "store that is conditionally executed prevents vectorization",
5185         "ConditionalStore", ORE, TheLoop);
5186     Width = 1;
5187     Cost = ScalarCost;
5188   }
5189 
5190   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5191              << "LV: Vectorization seems to be not beneficial, "
5192              << "but was forced by a user.\n");
5193   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5194   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5195   return Factor;
5196 }
5197 
5198 std::pair<unsigned, unsigned>
5199 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5200   unsigned MinWidth = -1U;
5201   unsigned MaxWidth = 8;
5202   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5203 
5204   // For each block.
5205   for (BasicBlock *BB : TheLoop->blocks()) {
5206     // For each instruction in the loop.
5207     for (Instruction &I : BB->instructionsWithoutDebug()) {
5208       Type *T = I.getType();
5209 
5210       // Skip ignored values.
5211       if (ValuesToIgnore.count(&I))
5212         continue;
5213 
5214       // Only examine Loads, Stores and PHINodes.
5215       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5216         continue;
5217 
5218       // Examine PHI nodes that are reduction variables. Update the type to
5219       // account for the recurrence type.
5220       if (auto *PN = dyn_cast<PHINode>(&I)) {
5221         if (!Legal->isReductionVariable(PN))
5222           continue;
5223         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5224         T = RdxDesc.getRecurrenceType();
5225       }
5226 
5227       // Examine the stored values.
5228       if (auto *ST = dyn_cast<StoreInst>(&I))
5229         T = ST->getValueOperand()->getType();
5230 
5231       // Ignore loaded pointer types and stored pointer types that are not
5232       // vectorizable.
5233       //
5234       // FIXME: The check here attempts to predict whether a load or store will
5235       //        be vectorized. We only know this for certain after a VF has
5236       //        been selected. Here, we assume that if an access can be
5237       //        vectorized, it will be. We should also look at extending this
5238       //        optimization to non-pointer types.
5239       //
5240       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5241           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5242         continue;
5243 
5244       MinWidth = std::min(MinWidth,
5245                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5246       MaxWidth = std::max(MaxWidth,
5247                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5248     }
5249   }
5250 
5251   return {MinWidth, MaxWidth};
5252 }
5253 
5254 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5255                                                            unsigned LoopCost) {
5256   // -- The interleave heuristics --
5257   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5258   // There are many micro-architectural considerations that we can't predict
5259   // at this level. For example, frontend pressure (on decode or fetch) due to
5260   // code size, or the number and capabilities of the execution ports.
5261   //
5262   // We use the following heuristics to select the interleave count:
5263   // 1. If the code has reductions, then we interleave to break the cross
5264   // iteration dependency.
5265   // 2. If the loop is really small, then we interleave to reduce the loop
5266   // overhead.
5267   // 3. We don't interleave if we think that we will spill registers to memory
5268   // due to the increased register pressure.
5269 
5270   if (!isScalarEpilogueAllowed())
5271     return 1;
5272 
  // A maximum safe dependence distance was already used to bound the
  // vectorization factor; do not interleave beyond it.
5274   if (Legal->getMaxSafeDepDistBytes() != -1U)
5275     return 1;
5276 
5277   // Do not interleave loops with a relatively small known or estimated trip
5278   // count.
5279   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5280   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5281     return 1;
5282 
5283   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so make sure each is at least one: assume
  // every register class has at least one instruction that uses at least one
  // register.
5286   for (auto& pair : R.MaxLocalUsers) {
5287     pair.second = std::max(pair.second, 1U);
5288   }
5289 
5290   // We calculate the interleave count using the following formula.
5291   // Subtract the number of loop invariants from the number of available
5292   // registers. These registers are used by all of the interleaved instances.
5293   // Next, divide the remaining registers by the number of registers that is
5294   // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations and alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens under OptForSize, in which case IC is set
  // to 1 above.
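  // For illustration: with 32 available registers in a class, 2 of them used
  // by loop invariants and a maximum local usage of 5, the estimate is
  // PowerOf2Floor((32 - 2) / 5) = 4 interleaved instances.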
5301   unsigned IC = UINT_MAX;
5302 
5303   for (auto& pair : R.MaxLocalUsers) {
5304     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5305     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5306                       << " registers of "
5307                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5308     if (VF == 1) {
5309       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5310         TargetNumRegisters = ForceTargetNumScalarRegs;
5311     } else {
5312       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5313         TargetNumRegisters = ForceTargetNumVectorRegs;
5314     }
5315     unsigned MaxLocalUsers = pair.second;
5316     unsigned LoopInvariantRegs = 0;
5317     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5318       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5319 
5320     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5321     // Don't count the induction variable as interleaved.
5322     if (EnableIndVarRegisterHeur) {
5323       TmpIC =
5324           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5325                         std::max(1U, (MaxLocalUsers - 1)));
5326     }
5327 
5328     IC = std::min(IC, TmpIC);
5329   }
5330 
5331   // Clamp the interleave ranges to reasonable counts.
5332   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5333 
5334   // Check if the user has overridden the max.
5335   if (VF == 1) {
5336     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5337       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5338   } else {
5339     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5340       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5341   }
5342 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF.
5345   if (BestKnownTC) {
5346     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5347   }
5348 
5349   // If we did not calculate the cost for VF (because the user selected the VF)
5350   // then we calculate the cost of VF here.
5351   if (LoopCost == 0)
5352     LoopCost = expectedCost(VF).first;
5353 
5354   assert(LoopCost && "Non-zero loop cost expected");
5355 
  // Clamp the calculated IC to be between 1 and the maximum interleave count
  // that the target and trip count allow.
5358   if (IC > MaxInterleaveCount)
5359     IC = MaxInterleaveCount;
5360   else if (IC < 1)
5361     IC = 1;
5362 
5363   // Interleave if we vectorized this loop and there is a reduction that could
5364   // benefit from interleaving.
5365   if (VF > 1 && !Legal->getReductionVars().empty()) {
5366     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5367     return IC;
5368   }
5369 
5370   // Note that if we've already vectorized the loop we will have done the
5371   // runtime check and so interleaving won't require further checks.
5372   bool InterleavingRequiresRuntimePointerCheck =
5373       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5374 
5375   // We want to interleave small loops in order to reduce the loop overhead and
5376   // potentially expose ILP opportunities.
5377   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5378   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead cost is 1 and use the cost model to
    // estimate the loop body cost; we then interleave until the loop overhead
    // is about 5% of the cost of the loop.
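    // For illustration (assuming the default SmallLoopCost threshold of 20): a
    // loop cost of 6 gives SmallIC = min(IC, PowerOf2Floor(20 / 6)) =
    // min(IC, 2).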
5382     unsigned SmallIC =
5383         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5384 
5385     // Interleave until store/load ports (estimated by max interleave count) are
5386     // saturated.
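    // For example, IC = 8 with two stores and one load yields StoresIC = 4 and
    // LoadsIC = 8.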
5387     unsigned NumStores = Legal->getNumStores();
5388     unsigned NumLoads = Legal->getNumLoads();
5389     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5390     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5391 
5392     // If we have a scalar reduction (vector reductions are already dealt with
5393     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit the interleave count to
    // 2 by default, so the critical path only gets increased by one reduction
    // operation.
5396     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5397       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5398       SmallIC = std::min(SmallIC, F);
5399       StoresIC = std::min(StoresIC, F);
5400       LoadsIC = std::min(LoadsIC, F);
5401     }
5402 
5403     if (EnableLoadStoreRuntimeInterleave &&
5404         std::max(StoresIC, LoadsIC) > SmallIC) {
5405       LLVM_DEBUG(
5406           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5407       return std::max(StoresIC, LoadsIC);
5408     }
5409 
5410     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5411     return SmallIC;
5412   }
5413 
5414   // Interleave if this is a large loop (small loops are already dealt with by
5415   // this point) that could benefit from interleaving.
5416   bool HasReductions = !Legal->getReductionVars().empty();
5417   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5418     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5419     return IC;
5420   }
5421 
5422   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5423   return 1;
5424 }
5425 
5426 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5427 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
  // encountered before their users. We assume that each instruction that has
  // in-loop users starts an interval. We record every time that an in-loop
  // value is used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time a new interval starts, by placing it in a
  // set. If we find this value in the multi-map then we remove it from the
  // set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because, when we unroll, loop-invariant values do not take
  // more registers.
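  // For illustration: a value defined at index 2 whose last in-loop use is at
  // index 7 contributes one open interval until index 7; the peak number of
  // simultaneously open intervals approximates the register pressure.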
5445   LoopBlocksDFS DFS(TheLoop);
5446   DFS.perform(LI);
5447 
5448   RegisterUsage RU;
5449 
5450   // Each 'key' in the map opens a new interval. The values
5451   // of the map are the index of the 'last seen' usage of the
5452   // instruction that is the key.
5453   using IntervalMap = DenseMap<Instruction *, unsigned>;
5454 
5455   // Maps instruction to its index.
5456   SmallVector<Instruction *, 64> IdxToInstr;
5457   // Marks the end of each interval.
5458   IntervalMap EndPoint;
5459   // Saves the list of instruction indices that are used in the loop.
5460   SmallPtrSet<Instruction *, 8> Ends;
5461   // Saves the list of values that are used in the loop but are
5462   // defined outside the loop, such as arguments and constants.
5463   SmallPtrSet<Value *, 8> LoopInvariants;
5464 
5465   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5466     for (Instruction &I : BB->instructionsWithoutDebug()) {
5467       IdxToInstr.push_back(&I);
5468 
5469       // Save the end location of each USE.
5470       for (Value *U : I.operands()) {
5471         auto *Instr = dyn_cast<Instruction>(U);
5472 
5473         // Ignore non-instruction values such as arguments, constants, etc.
5474         if (!Instr)
5475           continue;
5476 
5477         // If this instruction is outside the loop then record it and continue.
5478         if (!TheLoop->contains(Instr)) {
5479           LoopInvariants.insert(Instr);
5480           continue;
5481         }
5482 
5483         // Overwrite previous end points.
5484         EndPoint[Instr] = IdxToInstr.size();
5485         Ends.insert(Instr);
5486       }
5487     }
5488   }
5489 
5490   // Saves the list of intervals that end with the index in 'key'.
5491   using InstrList = SmallVector<Instruction *, 2>;
5492   DenseMap<unsigned, InstrList> TransposeEnds;
5493 
5494   // Transpose the EndPoints to a list of values that end at each index.
5495   for (auto &Interval : EndPoint)
5496     TransposeEnds[Interval.second].push_back(Interval.first);
5497 
5498   SmallPtrSet<Instruction *, 8> OpenIntervals;
5499 
5500   // Get the size of the widest register.
5501   unsigned MaxSafeDepDist = -1U;
5502   if (Legal->getMaxSafeDepDistBytes() != -1U)
5503     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5504   unsigned WidestRegister =
5505       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5506   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5507 
5508   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5509   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5510 
5511   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5512 
5513   // A lambda that gets the register usage for the given type and VF.
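  // For example, with a 128-bit widest register, an i32 value at VF = 8 needs
  // max(1, 8 * 32 / 128) = 2 registers.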
5514   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5515     if (Ty->isTokenTy())
5516       return 0U;
5517     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5518     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5519   };
5520 
5521   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5522     Instruction *I = IdxToInstr[i];
5523 
5524     // Remove all of the instructions that end at this location.
5525     InstrList &List = TransposeEnds[i];
5526     for (Instruction *ToRemove : List)
5527       OpenIntervals.erase(ToRemove);
5528 
5529     // Ignore instructions that are never used within the loop.
5530     if (!Ends.count(I))
5531       continue;
5532 
5533     // Skip ignored values.
5534     if (ValuesToIgnore.count(I))
5535       continue;
5536 
5537     // For each VF find the maximum usage of registers.
5538     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5539       // Count the number of live intervals.
5540       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5541 
5542       if (VFs[j] == 1) {
5543         for (auto Inst : OpenIntervals) {
5544           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5545           if (RegUsage.find(ClassID) == RegUsage.end())
5546             RegUsage[ClassID] = 1;
5547           else
5548             RegUsage[ClassID] += 1;
5549         }
5550       } else {
5551         collectUniformsAndScalars(VFs[j]);
5552         for (auto Inst : OpenIntervals) {
5553           // Skip ignored values for VF > 1.
5554           if (VecValuesToIgnore.count(Inst))
5555             continue;
5556           if (isScalarAfterVectorization(Inst, VFs[j])) {
5557             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5558             if (RegUsage.find(ClassID) == RegUsage.end())
5559               RegUsage[ClassID] = 1;
5560             else
5561               RegUsage[ClassID] += 1;
5562           } else {
5563             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5564             if (RegUsage.find(ClassID) == RegUsage.end())
5565               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5566             else
5567               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5568           }
5569         }
5570       }
5571 
5572       for (auto& pair : RegUsage) {
5573         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5574           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5575         else
5576           MaxUsages[j][pair.first] = pair.second;
5577       }
5578     }
5579 
5580     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5581                       << OpenIntervals.size() << '\n');
5582 
5583     // Add the current instruction to the list of open intervals.
5584     OpenIntervals.insert(I);
5585   }
5586 
5587   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5588     SmallMapVector<unsigned, unsigned, 4> Invariant;
5589 
5590     for (auto Inst : LoopInvariants) {
5591       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5592       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5593       if (Invariant.find(ClassID) == Invariant.end())
5594         Invariant[ClassID] = Usage;
5595       else
5596         Invariant[ClassID] += Usage;
5597     }
5598 
5599     LLVM_DEBUG({
5600       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5601       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5602              << " item\n";
5603       for (const auto &pair : MaxUsages[i]) {
5604         dbgs() << "LV(REG): RegisterClass: "
5605                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5606                << " registers\n";
5607       }
5608       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5609              << " item\n";
5610       for (const auto &pair : Invariant) {
5611         dbgs() << "LV(REG): RegisterClass: "
5612                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5613                << " registers\n";
5614       }
5615     });
5616 
5617     RU.LoopInvariantRegs = Invariant;
5618     RU.MaxLocalUsers = MaxUsages[i];
5619     RUs[i] = RU;
5620   }
5621 
5622   return RUs;
5623 }
5624 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5626   // TODO: Cost model for emulated masked load/store is completely
5627   // broken. This hack guides the cost model to use an artificially
5628   // high enough value to practically disable vectorization with such
5629   // operations, except where previously deployed legality hack allowed
5630   // using very low cost values. This is to avoid regressions coming simply
5631   // from moving "masked load/store" check from legality to cost model.
  // Emulation of masked Load/Gather was previously never allowed.
  // Only a limited number of emulated masked Store/Scatter operations was
  // allowed.
5634   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5635   return isa<LoadInst>(I) ||
5636          (isa<StoreInst>(I) &&
5637           NumPredStores > NumberOfStoresToPredicate);
5638 }
5639 
5640 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5641   // If we aren't vectorizing the loop, or if we've already collected the
5642   // instructions to scalarize, there's nothing to do. Collection may already
5643   // have occurred if we have a user-selected VF and are now computing the
5644   // expected cost for interleaving.
5645   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5646     return;
5647 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5649   // not profitable to scalarize any instructions, the presence of VF in the
5650   // map will indicate that we've analyzed it already.
5651   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5652 
5653   // Find all the instructions that are scalar with predication in the loop and
5654   // determine if it would be better to not if-convert the blocks they are in.
5655   // If so, we also record the instructions to scalarize.
5656   for (BasicBlock *BB : TheLoop->blocks()) {
5657     if (!blockNeedsPredication(BB))
5658       continue;
5659     for (Instruction &I : *BB)
5660       if (isScalarWithPredication(&I)) {
5661         ScalarCostsTy ScalarCosts;
5662         // Do not apply discount logic if hacked cost is needed
5663         // for emulated masked memrefs.
5664         if (!useEmulatedMaskMemRefHack(&I) &&
5665             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5666           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5667         // Remember that BB will remain after vectorization.
5668         PredicatedBBsAfterVectorization.insert(BB);
5669       }
5670   }
5671 }
5672 
5673 int LoopVectorizationCostModel::computePredInstDiscount(
5674     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5675     unsigned VF) {
5676   assert(!isUniformAfterVectorization(PredInst, VF) &&
5677          "Instruction marked uniform-after-vectorization will be predicated");
5678 
5679   // Initialize the discount to zero, meaning that the scalar version and the
5680   // vector version cost the same.
5681   int Discount = 0;
5682 
5683   // Holds instructions to analyze. The instructions we visit are mapped in
5684   // ScalarCosts. Those instructions are the ones that would be scalarized if
5685   // we find that the scalar version costs less.
5686   SmallVector<Instruction *, 8> Worklist;
5687 
5688   // Returns true if the given instruction can be scalarized.
5689   auto canBeScalarized = [&](Instruction *I) -> bool {
5690     // We only attempt to scalarize instructions forming a single-use chain
5691     // from the original predicated block that would otherwise be vectorized.
5692     // Although not strictly necessary, we give up on instructions we know will
5693     // already be scalar to avoid traversing chains that are unlikely to be
5694     // beneficial.
5695     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5696         isScalarAfterVectorization(I, VF))
5697       return false;
5698 
5699     // If the instruction is scalar with predication, it will be analyzed
5700     // separately. We ignore it within the context of PredInst.
5701     if (isScalarWithPredication(I))
5702       return false;
5703 
5704     // If any of the instruction's operands are uniform after vectorization,
5705     // the instruction cannot be scalarized. This prevents, for example, a
5706     // masked load from being scalarized.
5707     //
5708     // We assume we will only emit a value for lane zero of an instruction
5709     // marked uniform after vectorization, rather than VF identical values.
5710     // Thus, if we scalarize an instruction that uses a uniform, we would
5711     // create uses of values corresponding to the lanes we aren't emitting code
5712     // for. This behavior can be changed by allowing getScalarValue to clone
5713     // the lane zero values for uniforms rather than asserting.
5714     for (Use &U : I->operands())
5715       if (auto *J = dyn_cast<Instruction>(U.get()))
5716         if (isUniformAfterVectorization(J, VF))
5717           return false;
5718 
5719     // Otherwise, we can scalarize the instruction.
5720     return true;
5721   };
5722 
5723   // Compute the expected cost discount from scalarizing the entire expression
5724   // feeding the predicated instruction. We currently only consider expressions
5725   // that are single-use instruction chains.
5726   Worklist.push_back(PredInst);
5727   while (!Worklist.empty()) {
5728     Instruction *I = Worklist.pop_back_val();
5729 
5730     // If we've already analyzed the instruction, there's nothing to do.
5731     if (ScalarCosts.find(I) != ScalarCosts.end())
5732       continue;
5733 
5734     // Compute the cost of the vector instruction. Note that this cost already
5735     // includes the scalarization overhead of the predicated instruction.
5736     unsigned VectorCost = getInstructionCost(I, VF).first;
5737 
5738     // Compute the cost of the scalarized instruction. This cost is the cost of
5739     // the instruction as if it wasn't if-converted and instead remained in the
5740     // predicated block. We will scale this cost by block probability after
5741     // computing the scalarization overhead.
5742     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5743 
5744     // Compute the scalarization overhead of needed insertelement instructions
5745     // and phi nodes.
5746     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5747       ScalarCost += TTI.getScalarizationOverhead(
5748           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5749           APInt::getAllOnesValue(VF), true, false);
5750       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI,
5751                                             TTI::TCK_RecipThroughput);
5752     }
5753 
5754     // Compute the scalarization overhead of needed extractelement
5755     // instructions. For each of the instruction's operands, if the operand can
5756     // be scalarized, add it to the worklist; otherwise, account for the
5757     // overhead.
5758     for (Use &U : I->operands())
5759       if (auto *J = dyn_cast<Instruction>(U.get())) {
5760         assert(VectorType::isValidElementType(J->getType()) &&
5761                "Instruction has non-scalar type");
5762         if (canBeScalarized(J))
5763           Worklist.push_back(J);
5764         else if (needsExtract(J, VF))
5765           ScalarCost += TTI.getScalarizationOverhead(
5766               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5767               APInt::getAllOnesValue(VF), false, true);
5768       }
5769 
5770     // Scale the total scalar cost by block probability.
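    // (The reciprocal block probability is assumed to be 2, i.e. the
    // predicated block is expected to execute once every two iterations, so
    // the scalar cost is roughly halved.)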
5771     ScalarCost /= getReciprocalPredBlockProb();
5772 
5773     // Compute the discount. A non-negative discount means the vector version
5774     // of the instruction costs more, and scalarizing would be beneficial.
5775     Discount += VectorCost - ScalarCost;
5776     ScalarCosts[I] = ScalarCost;
5777   }
5778 
5779   return Discount;
5780 }
5781 
5782 LoopVectorizationCostModel::VectorizationCostTy
5783 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5784   VectorizationCostTy Cost;
5785 
5786   // For each block.
5787   for (BasicBlock *BB : TheLoop->blocks()) {
5788     VectorizationCostTy BlockCost;
5789 
5790     // For each instruction in the old loop.
5791     for (Instruction &I : BB->instructionsWithoutDebug()) {
5792       // Skip ignored values.
5793       if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I)))
5794         continue;
5795 
5796       VectorizationCostTy C = getInstructionCost(&I, VF);
5797 
5798       // Check if we should override the cost.
5799       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5800         C.first = ForceTargetInstructionCost;
5801 
5802       BlockCost.first += C.first;
5803       BlockCost.second |= C.second;
5804       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5805                         << " for VF " << VF << " For instruction: " << I
5806                         << '\n');
5807     }
5808 
5809     // If we are vectorizing a predicated block, it will have been
5810     // if-converted. This means that the block's instructions (aside from
5811     // stores and instructions that may divide by zero) will now be
5812     // unconditionally executed. For the scalar case, we may not always execute
5813     // the predicated block. Thus, scale the block's cost by the probability of
5814     // executing it.
5815     if (VF == 1 && blockNeedsPredication(BB))
5816       BlockCost.first /= getReciprocalPredBlockProb();
5817 
5818     Cost.first += BlockCost.first;
5819     Cost.second |= BlockCost.second;
5820   }
5821 
5822   return Cost;
5823 }
5824 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
5827 ///
5828 /// This SCEV can be sent to the Target in order to estimate the address
5829 /// calculation cost.
5830 static const SCEV *getAddressAccessSCEV(
5831               Value *Ptr,
5832               LoopVectorizationLegality *Legal,
5833               PredicatedScalarEvolution &PSE,
5834               const Loop *TheLoop) {
5835 
5836   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5837   if (!Gep)
5838     return nullptr;
5839 
5840   // We are looking for a gep with all loop invariant indices except for one
5841   // which should be an induction variable.
5842   auto SE = PSE.getSE();
5843   unsigned NumOperands = Gep->getNumOperands();
5844   for (unsigned i = 1; i < NumOperands; ++i) {
5845     Value *Opd = Gep->getOperand(i);
5846     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5847         !Legal->isInductionVariable(Opd))
5848       return nullptr;
5849   }
5850 
  // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the
  // Ptr SCEV.
5852   return PSE.getSCEV(Ptr);
5853 }
5854 
5855 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5856   return Legal->hasStride(I->getOperand(0)) ||
5857          Legal->hasStride(I->getOperand(1));
5858 }
5859 
5860 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5861                                                                  unsigned VF) {
5862   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5863   Type *ValTy = getMemInstValueType(I);
5864   auto SE = PSE.getSE();
5865 
5866   unsigned AS = getLoadStoreAddressSpace(I);
5867   Value *Ptr = getLoadStorePointerOperand(I);
5868   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5869 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5872   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5873 
5874   // Get the cost of the scalar memory instruction and address computation.
5875   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5876 
5877   // Don't pass *I here, since it is scalar but will actually be part of a
5878   // vectorized loop where the user of it is a vectorized instruction.
5879   const Align Alignment = getLoadStoreAlignment(I);
5880   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5881                                    Alignment, AS,
5882                                    TTI::TCK_RecipThroughput);
5883 
5884   // Get the overhead of the extractelement and insertelement instructions
5885   // we might create due to scalarization.
5886   Cost += getScalarizationOverhead(I, VF);
5887 
5888   // If we have a predicated store, it may not be executed for each vector
5889   // lane. Scale the cost by the probability of executing the predicated
5890   // block.
5891   if (isPredicatedInst(I)) {
5892     Cost /= getReciprocalPredBlockProb();
5893 
5894     if (useEmulatedMaskMemRefHack(I))
5895       // Artificially setting to a high enough value to practically disable
5896       // vectorization with such operations.
5897       Cost = 3000000;
5898   }
5899 
5900   return Cost;
5901 }
5902 
5903 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5904                                                              unsigned VF) {
5905   Type *ValTy = getMemInstValueType(I);
5906   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5907   Value *Ptr = getLoadStorePointerOperand(I);
5908   unsigned AS = getLoadStoreAddressSpace(I);
5909   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5910   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5911 
5912   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5913          "Stride should be 1 or -1 for consecutive memory access");
5914   const Align Alignment = getLoadStoreAlignment(I);
5915   unsigned Cost = 0;
5916   if (Legal->isMaskRequired(I))
5917     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5918                                       CostKind);
5919   else
5920     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5921                                 CostKind, I);
5922 
5923   bool Reverse = ConsecutiveStride < 0;
5924   if (Reverse)
5925     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5926   return Cost;
5927 }
5928 
5929 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5930                                                          unsigned VF) {
5931   Type *ValTy = getMemInstValueType(I);
5932   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5933   const Align Alignment = getLoadStoreAlignment(I);
5934   unsigned AS = getLoadStoreAddressSpace(I);
5935   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5936   if (isa<LoadInst>(I)) {
5937     return TTI.getAddressComputationCost(ValTy) +
5938            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5939                                CostKind) +
5940            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5941   }
5942   StoreInst *SI = cast<StoreInst>(I);
5943 
5944   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5945   return TTI.getAddressComputationCost(ValTy) +
5946          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5947                              CostKind) +
5948          (isLoopInvariantStoreValue
5949               ? 0
5950               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5951                                        VF - 1));
5952 }
5953 
5954 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5955                                                           unsigned VF) {
5956   Type *ValTy = getMemInstValueType(I);
5957   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5958   const Align Alignment = getLoadStoreAlignment(I);
5959   const Value *Ptr = getLoadStorePointerOperand(I);
5960 
5961   return TTI.getAddressComputationCost(VectorTy) +
5962          TTI.getGatherScatterOpCost(
5963              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5964              TargetTransformInfo::TCK_RecipThroughput, I);
5965 }
5966 
5967 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5968                                                             unsigned VF) {
5969   Type *ValTy = getMemInstValueType(I);
5970   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5971   unsigned AS = getLoadStoreAddressSpace(I);
5972 
5973   auto Group = getInterleavedAccessGroup(I);
5974   assert(Group && "Fail to get an interleaved access group.");
5975 
5976   unsigned InterleaveFactor = Group->getFactor();
5977   auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor);
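  // For example, a factor-2 interleave group of i32 elements at VF = 4 is
  // costed as a single wide <8 x i32> access.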
5978 
5979   // Holds the indices of existing members in an interleaved load group.
5980   // An interleaved store group doesn't need this as it doesn't allow gaps.
5981   SmallVector<unsigned, 4> Indices;
5982   if (isa<LoadInst>(I)) {
5983     for (unsigned i = 0; i < InterleaveFactor; i++)
5984       if (Group->getMember(i))
5985         Indices.push_back(i);
5986   }
5987 
5988   // Calculate the cost of the whole interleaved group.
5989   bool UseMaskForGaps =
5990       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5991   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5992       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
5993       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
5994 
5995   if (Group->isReverse()) {
5996     // TODO: Add support for reversed masked interleaved access.
5997     assert(!Legal->isMaskRequired(I) &&
5998            "Reverse masked interleaved access not supported.");
5999     Cost += Group->getNumMembers() *
6000             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6001   }
6002   return Cost;
6003 }
6004 
6005 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6006                                                               unsigned VF) {
6007   // Calculate scalar cost only. Vectorization cost should be ready at this
6008   // moment.
6009   if (VF == 1) {
6010     Type *ValTy = getMemInstValueType(I);
6011     const Align Alignment = getLoadStoreAlignment(I);
6012     unsigned AS = getLoadStoreAddressSpace(I);
6013 
6014     return TTI.getAddressComputationCost(ValTy) +
6015            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6016                                TTI::TCK_RecipThroughput, I);
6017   }
6018   return getWideningCost(I, VF);
6019 }
6020 
6021 LoopVectorizationCostModel::VectorizationCostTy
6022 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
6023   // If we know that this instruction will remain uniform, check the cost of
6024   // the scalar version.
6025   if (isUniformAfterVectorization(I, VF))
6026     VF = 1;
6027 
6028   if (VF > 1 && isProfitableToScalarize(I, VF))
6029     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6030 
6031   // Forced scalars do not have any scalarization overhead.
6032   auto ForcedScalar = ForcedScalars.find(VF);
6033   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
6034     auto InstSet = ForcedScalar->second;
6035     if (InstSet.count(I))
6036       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
6037   }
6038 
6039   Type *VectorTy;
6040   unsigned C = getInstructionCost(I, VF, VectorTy);
6041 
6042   bool TypeNotScalarized =
6043       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
6044   return VectorizationCostTy(C, TypeNotScalarized);
6045 }
6046 
6047 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6048                                                               unsigned VF) {
6049 
6050   if (VF == 1)
6051     return 0;
6052 
6053   unsigned Cost = 0;
6054   Type *RetTy = ToVectorTy(I->getType(), VF);
6055   if (!RetTy->isVoidTy() &&
6056       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6057     Cost += TTI.getScalarizationOverhead(
6058         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false);
6059 
6060   // Some targets keep addresses scalar.
6061   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6062     return Cost;
6063 
6064   // Some targets support efficient element stores.
6065   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6066     return Cost;
6067 
6068   // Collect operands to consider.
6069   CallInst *CI = dyn_cast<CallInst>(I);
6070   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6071 
6072   // Skip operands that do not require extraction/scalarization and do not incur
6073   // any overhead.
6074   return Cost + TTI.getOperandsScalarizationOverhead(
6075                     filterExtractingOperands(Ops, VF), VF);
6076 }
6077 
6078 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6079   if (VF == 1)
6080     return;
6081   NumPredStores = 0;
6082   for (BasicBlock *BB : TheLoop->blocks()) {
6083     // For each instruction in the old loop.
6084     for (Instruction &I : *BB) {
6085       Value *Ptr = getLoadStorePointerOperand(&I);
6086       if (!Ptr)
6087         continue;
6088 
6089       // TODO: We should generate better code and update the cost model for
6090       // predicated uniform stores. Today they are treated as any other
6091       // predicated store (see added test cases in
6092       // invariant-store-vectorization.ll).
6093       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6094         NumPredStores++;
6095 
6096       if (Legal->isUniform(Ptr) &&
6097           // Conditional loads and stores should be scalarized and predicated.
6098           // isScalarWithPredication cannot be used here since masked
6099           // gather/scatters are not considered scalar with predication.
6100           !Legal->blockNeedsPredication(I.getParent())) {
6101         // TODO: Avoid replicating loads and stores instead of
6102         // relying on instcombine to remove them.
6103         // Load: Scalar load + broadcast
6104         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6105         unsigned Cost = getUniformMemOpCost(&I, VF);
6106         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6107         continue;
6108       }
6109 
6110       // We assume that widening is the best solution when possible.
6111       if (memoryInstructionCanBeWidened(&I, VF)) {
6112         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6113         int ConsecutiveStride =
6114                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6115         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6116                "Expected consecutive stride.");
6117         InstWidening Decision =
6118             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6119         setWideningDecision(&I, VF, Decision, Cost);
6120         continue;
6121       }
6122 
6123       // Choose between Interleaving, Gather/Scatter or Scalarization.
6124       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6125       unsigned NumAccesses = 1;
6126       if (isAccessInterleaved(&I)) {
6127         auto Group = getInterleavedAccessGroup(&I);
6128         assert(Group && "Fail to get an interleaved access group.");
6129 
6130         // Make one decision for the whole group.
6131         if (getWideningDecision(&I, VF) != CM_Unknown)
6132           continue;
6133 
6134         NumAccesses = Group->getNumMembers();
6135         if (interleavedAccessCanBeWidened(&I, VF))
6136           InterleaveCost = getInterleaveGroupCost(&I, VF);
6137       }
6138 
6139       unsigned GatherScatterCost =
6140           isLegalGatherOrScatter(&I)
6141               ? getGatherScatterCost(&I, VF) * NumAccesses
6142               : std::numeric_limits<unsigned>::max();
6143 
6144       unsigned ScalarizationCost =
6145           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6146 
6147       // Choose better solution for the current VF,
6148       // write down this decision and use it during vectorization.
6149       unsigned Cost;
6150       InstWidening Decision;
6151       if (InterleaveCost <= GatherScatterCost &&
6152           InterleaveCost < ScalarizationCost) {
6153         Decision = CM_Interleave;
6154         Cost = InterleaveCost;
6155       } else if (GatherScatterCost < ScalarizationCost) {
6156         Decision = CM_GatherScatter;
6157         Cost = GatherScatterCost;
6158       } else {
6159         Decision = CM_Scalarize;
6160         Cost = ScalarizationCost;
6161       }
6162       // If the instruction belongs to an interleave group, the whole group
6163       // receives the same decision. The cost is assigned to the group as a
6164       // whole, but it will actually be recorded on a single member instruction.
6165       if (auto Group = getInterleavedAccessGroup(&I))
6166         setWideningDecision(Group, VF, Decision, Cost);
6167       else
6168         setWideningDecision(&I, VF, Decision, Cost);
6169     }
6170   }
6171 
6172   // Make sure that any load of an address and any other address computation
6173   // remains scalar unless there is gather/scatter support. This avoids
6174   // inevitable extracts into address registers, and also has the benefit of
6175   // activating LSR more, since that pass can't optimize vectorized
6176   // addresses.
6177   if (TTI.prefersVectorizedAddressing())
6178     return;
6179 
6180   // Start with all scalar pointer uses.
6181   SmallPtrSet<Instruction *, 8> AddrDefs;
6182   for (BasicBlock *BB : TheLoop->blocks())
6183     for (Instruction &I : *BB) {
6184       Instruction *PtrDef =
6185         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6186       if (PtrDef && TheLoop->contains(PtrDef) &&
6187           getWideningDecision(&I, VF) != CM_GatherScatter)
6188         AddrDefs.insert(PtrDef);
6189     }
6190 
6191   // Add all instructions used to generate the addresses.
6192   SmallVector<Instruction *, 4> Worklist;
6193   for (auto *I : AddrDefs)
6194     Worklist.push_back(I);
6195   while (!Worklist.empty()) {
6196     Instruction *I = Worklist.pop_back_val();
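    // Follow only operands defined in the same block and skip PHI nodes; this
    // keeps the traversal local to the address computation.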
6197     for (auto &Op : I->operands())
6198       if (auto *InstOp = dyn_cast<Instruction>(Op))
6199         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6200             AddrDefs.insert(InstOp).second)
6201           Worklist.push_back(InstOp);
6202   }
6203 
6204   for (auto *I : AddrDefs) {
6205     if (isa<LoadInst>(I)) {
6206       // Setting the desired widening decision should ideally be handled
6207       // by cost functions, but since this involves the task of finding out
6208       // if the loaded register is involved in an address computation, it is
6209       // instead changed here when we know this is the case.
6210       InstWidening Decision = getWideningDecision(I, VF);
6211       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6212         // Scalarize a widened load of address.
6213         setWideningDecision(I, VF, CM_Scalarize,
6214                             (VF * getMemoryInstructionCost(I, 1)));
6215       else if (auto Group = getInterleavedAccessGroup(I)) {
6216         // Scalarize an interleave group of address loads.
6217         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6218           if (Instruction *Member = Group->getMember(I))
6219             setWideningDecision(Member, VF, CM_Scalarize,
6220                                 (VF * getMemoryInstructionCost(Member, 1)));
6221         }
6222       }
6223     } else
6224       // Make sure I gets scalarized and a cost estimate without
6225       // scalarization overhead.
6226       ForcedScalars[VF].insert(I);
6227   }
6228 }
6229 
6230 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6231                                                         unsigned VF,
6232                                                         Type *&VectorTy) {
6233   Type *RetTy = I->getType();
6234   if (canTruncateToMinimalBitwidth(I, VF))
6235     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6236   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6237   auto SE = PSE.getSE();
6238   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6239 
6240   // TODO: We need to estimate the cost of intrinsic calls.
6241   switch (I->getOpcode()) {
6242   case Instruction::GetElementPtr:
6243     // We mark this instruction as zero-cost because the cost of GEPs in
6244     // vectorized code depends on whether the corresponding memory instruction
6245     // is scalarized or not. Therefore, we handle GEPs with the memory
6246     // instruction cost.
6247     return 0;
6248   case Instruction::Br: {
6249     // In cases of scalarized and predicated instructions, there will be VF
6250     // predicated blocks in the vectorized loop. Each branch around these
6251     // blocks also requires an extract of its vector compare i1 element.
6252     bool ScalarPredicatedBB = false;
6253     BranchInst *BI = cast<BranchInst>(I);
6254     if (VF > 1 && BI->isConditional() &&
6255         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6256          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6257       ScalarPredicatedBB = true;
6258 
6259     if (ScalarPredicatedBB) {
6260       // Return cost for branches around scalarized and predicated blocks.
6261       auto *Vec_i1Ty =
6262           FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6263       return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
6264                                            false, true) +
6265               (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF));
6266     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6267       // The back-edge branch will remain, as will all scalar branches.
6268       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6269     else
6270       // This branch will be eliminated by if-conversion.
6271       return 0;
6272     // Note: We currently assume zero cost for an unconditional branch inside
6273     // a predicated block since it will become a fall-through, although we
6274     // may decide in the future to call TTI for all branches.
6275   }
6276   case Instruction::PHI: {
6277     auto *Phi = cast<PHINode>(I);
6278 
6279     // First-order recurrences are replaced by vector shuffles inside the loop.
6280     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6281     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6282       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6283                                 cast<VectorType>(VectorTy), VF - 1,
6284                                 FixedVectorType::get(RetTy, 1));
6285 
6286     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6287     // converted into select instructions. We require N - 1 selects per phi
6288     // node, where N is the number of incoming values.
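    // For example, a phi with three incoming values becomes two selects.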
6289     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6290       return (Phi->getNumIncomingValues() - 1) *
6291              TTI.getCmpSelInstrCost(
6292                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6293                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6294                  CostKind);
6295 
6296     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6297   }
6298   case Instruction::UDiv:
6299   case Instruction::SDiv:
6300   case Instruction::URem:
6301   case Instruction::SRem:
6302     // If we have a predicated instruction, it may not be executed for each
6303     // vector lane. Get the scalarization cost and scale this amount by the
6304     // probability of executing the predicated block. If the instruction is not
6305     // predicated, we fall through to the next case.
6306     if (VF > 1 && isScalarWithPredication(I)) {
6307       unsigned Cost = 0;
6308 
6309       // These instructions have a non-void type, so account for the phi nodes
6310       // that we will create. This cost is likely to be zero. The phi node
6311       // cost, if any, should be scaled by the block probability because it
6312       // models a copy at the end of each predicated block.
6313       Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6314 
6315       // The cost of the non-predicated instruction.
6316       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6317 
6318       // The cost of insertelement and extractelement instructions needed for
6319       // scalarization.
6320       Cost += getScalarizationOverhead(I, VF);
6321 
6322       // Scale the cost by the probability of executing the predicated blocks.
6323       // This assumes the predicated block for each vector lane is equally
6324       // likely.
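      // For example, with VF == 4 the sum above covers four scalar copies plus
      // their insert/extract overhead; dividing by the reciprocal block
      // probability (assumed to be 2, i.e. 50% execution) halves that cost.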
6325       return Cost / getReciprocalPredBlockProb();
6326     }
6327     LLVM_FALLTHROUGH;
6328   case Instruction::Add:
6329   case Instruction::FAdd:
6330   case Instruction::Sub:
6331   case Instruction::FSub:
6332   case Instruction::Mul:
6333   case Instruction::FMul:
6334   case Instruction::FDiv:
6335   case Instruction::FRem:
6336   case Instruction::Shl:
6337   case Instruction::LShr:
6338   case Instruction::AShr:
6339   case Instruction::And:
6340   case Instruction::Or:
6341   case Instruction::Xor: {
6342     // Since we will replace the stride by 1, the multiplication should go away.
6343     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6344       return 0;
6345     // Certain instructions can be cheaper to vectorize if they have a constant
6346     // second vector operand. One example of this is shifts on x86.
6347     Value *Op2 = I->getOperand(1);
6348     TargetTransformInfo::OperandValueProperties Op2VP;
6349     TargetTransformInfo::OperandValueKind Op2VK =
6350         TTI.getOperandInfo(Op2, Op2VP);
6351     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6352       Op2VK = TargetTransformInfo::OK_UniformValue;
6353 
6354     SmallVector<const Value *, 4> Operands(I->operand_values());
6355     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6356     return N * TTI.getArithmeticInstrCost(
6357                    I->getOpcode(), VectorTy, CostKind,
6358                    TargetTransformInfo::OK_AnyValue,
6359                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6360   }
6361   case Instruction::FNeg: {
6362     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6363     return N * TTI.getArithmeticInstrCost(
6364                    I->getOpcode(), VectorTy, CostKind,
6365                    TargetTransformInfo::OK_AnyValue,
6366                    TargetTransformInfo::OK_AnyValue,
6367                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6368                    I->getOperand(0), I);
6369   }
6370   case Instruction::Select: {
6371     SelectInst *SI = cast<SelectInst>(I);
6372     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6373     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6374     Type *CondTy = SI->getCondition()->getType();
6375     if (!ScalarCond)
6376       CondTy = FixedVectorType::get(CondTy, VF);
6377 
6378     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6379                                   CostKind, I);
6380   }
6381   case Instruction::ICmp:
6382   case Instruction::FCmp: {
6383     Type *ValTy = I->getOperand(0)->getType();
6384     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6385     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6386       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6387     VectorTy = ToVectorTy(ValTy, VF);
6388     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6389                                   I);
6390   }
6391   case Instruction::Store:
6392   case Instruction::Load: {
6393     unsigned Width = VF;
6394     if (Width > 1) {
6395       InstWidening Decision = getWideningDecision(I, Width);
6396       assert(Decision != CM_Unknown &&
6397              "CM decision should be taken at this point");
6398       if (Decision == CM_Scalarize)
6399         Width = 1;
6400     }
6401     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6402     return getMemoryInstructionCost(I, VF);
6403   }
6404   case Instruction::ZExt:
6405   case Instruction::SExt:
6406   case Instruction::FPToUI:
6407   case Instruction::FPToSI:
6408   case Instruction::FPExt:
6409   case Instruction::PtrToInt:
6410   case Instruction::IntToPtr:
6411   case Instruction::SIToFP:
6412   case Instruction::UIToFP:
6413   case Instruction::Trunc:
6414   case Instruction::FPTrunc:
6415   case Instruction::BitCast: {
6416     // We optimize the truncation of induction variables having constant
6417     // integer steps. The cost of these truncations is the same as the scalar
6418     // operation.
6419     if (isOptimizableIVTruncate(I, VF)) {
6420       auto *Trunc = cast<TruncInst>(I);
6421       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6422                                   Trunc->getSrcTy(), CostKind, Trunc);
6423     }
6424 
6425     Type *SrcScalarTy = I->getOperand(0)->getType();
6426     Type *SrcVecTy =
6427         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6428     if (canTruncateToMinimalBitwidth(I, VF)) {
6429       // This cast is going to be shrunk. This may remove the cast or it might
6430       // turn it into a slightly different cast. For example, if MinBW == 16,
6431       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6432       //
6433       // Calculate the modified src and dest types.
6434       Type *MinVecTy = VectorTy;
6435       if (I->getOpcode() == Instruction::Trunc) {
6436         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6437         VectorTy =
6438             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6439       } else if (I->getOpcode() == Instruction::ZExt ||
6440                  I->getOpcode() == Instruction::SExt) {
6441         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6442         VectorTy =
6443             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6444       }
6445     }
6446 
6447     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6448     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
6449                                     CostKind, I);
6450   }
6451   case Instruction::Call: {
6452     bool NeedToScalarize;
6453     CallInst *CI = cast<CallInst>(I);
6454     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6455     if (getVectorIntrinsicIDForCall(CI, TLI))
6456       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6457     return CallCost;
6458   }
6459   default:
6460     // The cost of executing VF copies of the scalar instruction. This opcode
6461     // is unknown. Assume that it is the same as 'mul'.
6462     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
6463                                            CostKind) +
6464            getScalarizationOverhead(I, VF);
6465   } // end of switch.
6466 }
6467 
6468 char LoopVectorize::ID = 0;
6469 
6470 static const char lv_name[] = "Loop Vectorization";
6471 
6472 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6473 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6474 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6475 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6476 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6477 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6478 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6479 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6480 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6481 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6482 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6483 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6484 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6485 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6486 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6487 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6488 
6489 namespace llvm {
6490 
6491 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6492 
6493 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6494                               bool VectorizeOnlyWhenForced) {
6495   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6496 }
6497 
6498 } // end namespace llvm
6499 
6500 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6501   // Check if the pointer operand of a load or store instruction is
6502   // consecutive.
6503   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6504     return Legal->isConsecutivePtr(Ptr);
6505   return false;
6506 }
6507 
6508 void LoopVectorizationCostModel::collectValuesToIgnore() {
6509   // Ignore ephemeral values.
6510   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6511 
6512   // Ignore type-promoting instructions we identified during reduction
6513   // detection.
6514   for (auto &Reduction : Legal->getReductionVars()) {
6515     RecurrenceDescriptor &RedDes = Reduction.second;
6516     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6517     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6518   }
6519   // Ignore type-casting instructions we identified during induction
6520   // detection.
6521   for (auto &Induction : Legal->getInductionVars()) {
6522     InductionDescriptor &IndDes = Induction.second;
6523     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6524     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6525   }
6526 }
6527 
6528 // TODO: we could return a pair of values that specify the max VF and
6529 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6530 // `buildVPlans(VF, VF)`. We cannot do it yet because VPlan currently
6531 // lacks a cost model that can choose which plan to execute when
6532 // more than one is generated.
6533 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6534                                  LoopVectorizationCostModel &CM) {
6535   unsigned WidestType;
6536   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6537   return WidestVectorRegBits / WidestType;
6538 }
6539 
6540 VectorizationFactor
6541 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6542   unsigned VF = UserVF;
6543   // Outer loop handling: outer loops may require CFG and instruction-level
6544   // transformations before profitability can even be evaluated.
6545   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6546   // the vectorization pipeline.
6547   if (!OrigLoop->empty()) {
6548     // If the user doesn't provide a vectorization factor, determine a
6549     // reasonable one.
6550     if (!UserVF) {
6551       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector */), CM);
6552       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6553 
6554       // Make sure we have a VF > 1 for stress testing.
6555       if (VPlanBuildStressTest && VF < 2) {
6556         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6557                           << "overriding computed VF.\n");
6558         VF = 4;
6559       }
6560     }
6561     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6562     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6563     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6564                       << " to build VPlans.\n");
6565     buildVPlans(VF, VF);
6566 
6567     // For VPlan build stress testing, we bail out after VPlan construction.
6568     if (VPlanBuildStressTest)
6569       return VectorizationFactor::Disabled();
6570 
6571     return {VF, 0};
6572   }
6573 
6574   LLVM_DEBUG(
6575       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6576                 "VPlan-native path.\n");
6577   return VectorizationFactor::Disabled();
6578 }
6579 
6580 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
6581                                                              unsigned UserIC) {
6582   assert(OrigLoop->empty() && "Inner loop expected.");
6583   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
6584   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6585     return None;
6586 
6587   // Invalidate interleave groups if all blocks of loop will be predicated.
6588   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6589       !useMaskedInterleavedAccesses(*TTI)) {
6590     LLVM_DEBUG(
6591         dbgs()
6592         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6593            "which requires masked-interleaved support.\n");
6594     if (CM.InterleaveInfo.invalidateGroups())
6595       // Invalidating interleave groups also requires invalidating all decisions
6596       // based on them, which includes widening decisions and uniform and scalar
6597       // values.
6598       CM.invalidateCostModelingDecisions();
6599   }
6600 
6601   if (UserVF) {
6602     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6603     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6604     // Collect the instructions (and their associated costs) that will be more
6605     // profitable to scalarize.
6606     CM.selectUserVectorizationFactor(UserVF);
6607     buildVPlansWithVPRecipes(UserVF, UserVF);
6608     LLVM_DEBUG(printPlans(dbgs()));
6609     return {{UserVF, 0}};
6610   }
6611 
6612   unsigned MaxVF = MaybeMaxVF.getValue();
6613   assert(MaxVF != 0 && "MaxVF is zero.");
6614 
6615   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6616     // Collect Uniform and Scalar instructions after vectorization with VF.
6617     CM.collectUniformsAndScalars(VF);
6618 
6619     // Collect the instructions (and their associated costs) that will be more
6620     // profitable to scalarize.
6621     if (VF > 1)
6622       CM.collectInstsToScalarize(VF);
6623   }
6624 
6625   buildVPlansWithVPRecipes(1, MaxVF);
6626   LLVM_DEBUG(printPlans(dbgs()));
6627   if (MaxVF == 1)
6628     return VectorizationFactor::Disabled();
6629 
6630   // Select the optimal vectorization factor.
6631   return CM.selectVectorizationFactor(MaxVF);
6632 }
6633 
6634 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6635   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6636                     << '\n');
6637   BestVF = VF;
6638   BestUF = UF;
6639 
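  // Keep only the VPlan that can handle the chosen VF; exactly one plan is
  // expected to remain.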
6640   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6641     return !Plan->hasVF(VF);
6642   });
6643   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6644 }
6645 
6646 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6647                                            DominatorTree *DT) {
6648   // Perform the actual loop transformation.
6649 
6650   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6651   VPCallbackILV CallbackILV(ILV);
6652 
6653   VPTransformState State{BestVF, BestUF,      LI,
6654                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6655                          &ILV,   CallbackILV};
6656   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6657   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6658   State.CanonicalIV = ILV.Induction;
6659 
6660   //===------------------------------------------------===//
6661   //
6662   // Notice: any optimization or new instruction that goes
6663   // into the code below should also be implemented in
6664   // the cost-model.
6665   //
6666   //===------------------------------------------------===//
6667 
6668   // 2. Copy and widen instructions from the old loop into the new loop.
6669   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6670   VPlans.front()->execute(&State);
6671 
6672   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6673   //    predication, updating analyses.
6674   ILV.fixVectorizedLoop();
6675 }
6676 
6677 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6678     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6679   BasicBlock *Latch = OrigLoop->getLoopLatch();
6680 
6681   // We create new control-flow for the vectorized loop, so the original
6682   // condition will be dead after vectorization if it's only used by the
6683   // branch.
6684   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6685   if (Cmp && Cmp->hasOneUse())
6686     DeadInstructions.insert(Cmp);
6687 
6688   // We create new "steps" for induction variable updates to which the original
6689   // induction variables map. An original update instruction will be dead if
6690   // all its users except the induction variable are dead.
6691   for (auto &Induction : Legal->getInductionVars()) {
6692     PHINode *Ind = Induction.first;
6693     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6694     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6695           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
6696         }))
6697       DeadInstructions.insert(IndUpdate);
6698 
6699     // We also record as "Dead" the type-casting instructions we had identified
6700     // during induction analysis. We don't need any handling for them in the
6701     // vectorized loop because we have proven that, under a proper runtime
6702     // test guarding the vectorized loop, the value of the phi and the cast
6703     // value of the phi are the same. The last instruction in this casting chain
6704     // will get its scalar/vector/widened def from the scalar/vector/widened def
6705     // of the respective phi node. Any other casts in the induction def-use chain
6706     // have no other uses outside the phi update chain, and will be ignored.
6707     InductionDescriptor &IndDes = Induction.second;
6708     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6709     DeadInstructions.insert(Casts.begin(), Casts.end());
6710   }
6711 }
6712 
6713 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6714 
6715 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6716 
6717 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6718                                         Instruction::BinaryOps BinOp) {
6719   // When unrolling and the VF is 1, we only need to add a simple scalar.
6720   Type *Ty = Val->getType();
6721   assert(!Ty->isVectorTy() && "Val must be a scalar");
6722 
6723   if (Ty->isFloatingPointTy()) {
6724     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6725 
6726     // Floating point operations had to be 'fast' to enable the unrolling.
6727     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6728     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6729   }
6730   Constant *C = ConstantInt::get(Ty, StartIdx);
6731   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6732 }
6733 
6734 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6735   SmallVector<Metadata *, 4> MDs;
6736   // Reserve first location for self reference to the LoopID metadata node.
6737   MDs.push_back(nullptr);
6738   bool IsUnrollMetadata = false;
6739   MDNode *LoopID = L->getLoopID();
6740   if (LoopID) {
6741     // First find existing loop unrolling disable metadata.
6742     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6743       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6744       if (MD) {
6745         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6746         IsUnrollMetadata =
6747             S && S->getString().startswith("llvm.loop.unroll.disable");
6748       }
6749       MDs.push_back(LoopID->getOperand(i));
6750     }
6751   }
6752 
6753   if (!IsUnrollMetadata) {
6754     // Add runtime unroll disable metadata.
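    // The new operand is a node holding the string
    //   "llvm.loop.unroll.runtime.disable",
    // and the rebuilt LoopID is made self-referential via operand 0.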
6755     LLVMContext &Context = L->getHeader()->getContext();
6756     SmallVector<Metadata *, 1> DisableOperands;
6757     DisableOperands.push_back(
6758         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6759     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6760     MDs.push_back(DisableNode);
6761     MDNode *NewLoopID = MDNode::get(Context, MDs);
6762     // Set operand 0 to refer to the loop id itself.
6763     NewLoopID->replaceOperandWith(0, NewLoopID);
6764     L->setLoopID(NewLoopID);
6765   }
6766 }
6767 
6768 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6769     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6770   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6771   bool PredicateAtRangeStart = Predicate(Range.Start);
6772 
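  // Shrink Range.End to the first VF at which the predicate's answer differs
  // from its answer at Range.Start, so the clamped range behaves uniformly.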
6773   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6774     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6775       Range.End = TmpVF;
6776       break;
6777     }
6778 
6779   return PredicateAtRangeStart;
6780 }
6781 
6782 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6783 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6784 /// of VF's starting at a given VF and extending it as much as possible. Each
6785 /// vectorization decision can potentially shorten this sub-range during
6786 /// buildVPlan().
6787 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6788   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6789     VFRange SubRange = {VF, MaxVF + 1};
6790     VPlans.push_back(buildVPlan(SubRange));
6791     VF = SubRange.End;
6792   }
6793 }
6794 
6795 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6796                                          VPlanPtr &Plan) {
6797   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6798 
6799   // Look for cached value.
6800   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6801   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6802   if (ECEntryIt != EdgeMaskCache.end())
6803     return ECEntryIt->second;
6804 
6805   VPValue *SrcMask = createBlockInMask(Src, Plan);
6806 
6807   // The terminator has to be a branch inst!
6808   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6809   assert(BI && "Unexpected terminator found");
6810 
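  // An unconditional branch, or a conditional branch with identical
  // successors, contributes no extra condition: the edge mask is simply the
  // source block's mask.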
6811   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6812     return EdgeMaskCache[Edge] = SrcMask;
6813 
6814   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6815   assert(EdgeMask && "No Edge Mask found for condition");
6816 
6817   if (BI->getSuccessor(0) != Dst)
6818     EdgeMask = Builder.createNot(EdgeMask);
6819 
6820   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6821     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6822 
6823   return EdgeMaskCache[Edge] = EdgeMask;
6824 }
6825 
6826 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6827   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6828 
6829   // Look for cached value.
6830   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6831   if (BCEntryIt != BlockMaskCache.end())
6832     return BCEntryIt->second;
6833 
6834   // All-one mask is modelled as no-mask following the convention for masked
6835   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6836   VPValue *BlockMask = nullptr;
6837 
6838   if (OrigLoop->getHeader() == BB) {
6839     if (!CM.blockNeedsPredication(BB))
6840       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6841 
6842     // Introduce the early-exit compare IV <= BTC to form header block mask.
6843     // This is used instead of IV < TC because TC may wrap, unlike BTC.
6844     // Start by constructing the desired canonical IV.
6845     VPValue *IV = nullptr;
6846     if (Legal->getPrimaryInduction())
6847       IV = Plan->getVPValue(Legal->getPrimaryInduction());
6848     else {
6849       auto IVRecipe = new VPWidenCanonicalIVRecipe();
6850       Builder.getInsertBlock()->appendRecipe(IVRecipe);
6851       IV = IVRecipe->getVPValue();
6852     }
6853     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6854     bool TailFolded = !CM.isScalarEpilogueAllowed();
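    // When folding the tail, emit an active-lane-mask computation if the
    // target asks for it; otherwise fall back to comparing the IV against the
    // backedge-taken count.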
6855     if (TailFolded && CM.TTI.emitGetActiveLaneMask())
6856       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
6857     else
6858       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6859     return BlockMaskCache[BB] = BlockMask;
6860   }
6861 
6862   // This is the block mask. We OR all incoming edges.
6863   for (auto *Predecessor : predecessors(BB)) {
6864     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6865     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6866       return BlockMaskCache[BB] = EdgeMask;
6867 
6868     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6869       BlockMask = EdgeMask;
6870       continue;
6871     }
6872 
6873     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6874   }
6875 
6876   return BlockMaskCache[BB] = BlockMask;
6877 }
6878 
6879 VPWidenMemoryInstructionRecipe *
6880 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6881                                   VPlanPtr &Plan) {
6882   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6883          "Must be called with either a load or store");
6884 
6885   auto willWiden = [&](unsigned VF) -> bool {
6886     if (VF == 1)
6887       return false;
6888     LoopVectorizationCostModel::InstWidening Decision =
6889         CM.getWideningDecision(I, VF);
6890     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6891            "CM decision should be taken at this point.");
6892     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6893       return true;
6894     if (CM.isScalarAfterVectorization(I, VF) ||
6895         CM.isProfitableToScalarize(I, VF))
6896       return false;
6897     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6898   };
6899 
6900   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6901     return nullptr;
6902 
6903   VPValue *Mask = nullptr;
6904   if (Legal->isMaskRequired(I))
6905     Mask = createBlockInMask(I->getParent(), Plan);
6906 
6907   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6908   if (LoadInst *Load = dyn_cast<LoadInst>(I))
6909     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
6910 
6911   StoreInst *Store = cast<StoreInst>(I);
6912   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
6913   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
6914 }
6915 
6916 VPWidenIntOrFpInductionRecipe *
6917 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
6918   // Check if this is an integer or fp induction. If so, build the recipe that
6919   // produces its scalar and vector values.
6920   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6921   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6922       II.getKind() == InductionDescriptor::IK_FpInduction)
6923     return new VPWidenIntOrFpInductionRecipe(Phi);
6924 
6925   return nullptr;
6926 }
6927 
6928 VPWidenIntOrFpInductionRecipe *
6929 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
6930                                                 VFRange &Range) const {
6931   // Optimize the special case where the source is a constant integer
6932   // induction variable. Notice that we can only optimize the 'trunc' case
6933   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6934   // (c) other casts depend on pointer size.
6935 
6936   // Determine whether \p K is a truncation based on an induction variable that
6937   // can be optimized.
6938   auto isOptimizableIVTruncate =
6939       [&](Instruction *K) -> std::function<bool(unsigned)> {
6940     return
6941         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6942   };
6943 
6944   if (LoopVectorizationPlanner::getDecisionAndClampRange(
6945           isOptimizableIVTruncate(I), Range))
6946     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6947                                              I);
6948   return nullptr;
6949 }
6950 
6951 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
6952   // We know that all PHIs in non-header blocks are converted into selects, so
6953   // we don't have to worry about the insertion order and we can just use the
6954   // builder. At this point we generate the predication tree. There may be
6955   // duplications since this is a simple recursive scan, but future
6956   // optimizations will clean it up.
6957 
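  // Operands are laid out as (incoming value, edge mask) pairs; the mask is
  // omitted only when there is a single incoming value whose edge mask is
  // all-one.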
6958   SmallVector<VPValue *, 2> Operands;
6959   unsigned NumIncoming = Phi->getNumIncomingValues();
6960   for (unsigned In = 0; In < NumIncoming; In++) {
6961     VPValue *EdgeMask =
6962       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6963     assert((EdgeMask || NumIncoming == 1) &&
6964            "Multiple predecessors with one having a full mask");
6965     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
6966     if (EdgeMask)
6967       Operands.push_back(EdgeMask);
6968   }
6969   return new VPBlendRecipe(Phi, Operands);
6970 }
6971 
6972 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
6973                                                    VPlan &Plan) const {
6974 
6975   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6976       [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); },
6977       Range);
6978 
6979   if (IsPredicated)
6980     return nullptr;
6981 
6982   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6983   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6984              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6985     return nullptr;
6986 
6987   auto willWiden = [&](unsigned VF) -> bool {
6988     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6989     // The following case may be scalarized depending on the VF.
6990     // The flag shows whether we use an intrinsic or a usual call for the
6991     // vectorized version of the instruction.
6992     // Is it beneficial to perform the intrinsic call compared to the lib call?
6993     bool NeedToScalarize = false;
6994     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6995     bool UseVectorIntrinsic =
6996         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6997     return UseVectorIntrinsic || !NeedToScalarize;
6998   };
6999 
7000   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7001     return nullptr;
7002 
7003   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7004 }
7005 
7006 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7007   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7008          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7009   // The instruction should be widened, unless it is scalar after
7010   // vectorization, scalarization is profitable, or it is predicated.
7011   auto WillScalarize = [this, I](unsigned VF) -> bool {
7012     return CM.isScalarAfterVectorization(I, VF) ||
7013            CM.isProfitableToScalarize(I, VF) ||
7014            CM.isScalarWithPredication(I, VF);
7015   };
7016   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7017                                                              Range);
7018 }
7019 
7020 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7021   auto IsVectorizableOpcode = [](unsigned Opcode) {
7022     switch (Opcode) {
7023     case Instruction::Add:
7024     case Instruction::And:
7025     case Instruction::AShr:
7026     case Instruction::BitCast:
7027     case Instruction::FAdd:
7028     case Instruction::FCmp:
7029     case Instruction::FDiv:
7030     case Instruction::FMul:
7031     case Instruction::FNeg:
7032     case Instruction::FPExt:
7033     case Instruction::FPToSI:
7034     case Instruction::FPToUI:
7035     case Instruction::FPTrunc:
7036     case Instruction::FRem:
7037     case Instruction::FSub:
7038     case Instruction::ICmp:
7039     case Instruction::IntToPtr:
7040     case Instruction::LShr:
7041     case Instruction::Mul:
7042     case Instruction::Or:
7043     case Instruction::PtrToInt:
7044     case Instruction::SDiv:
7045     case Instruction::Select:
7046     case Instruction::SExt:
7047     case Instruction::Shl:
7048     case Instruction::SIToFP:
7049     case Instruction::SRem:
7050     case Instruction::Sub:
7051     case Instruction::Trunc:
7052     case Instruction::UDiv:
7053     case Instruction::UIToFP:
7054     case Instruction::URem:
7055     case Instruction::Xor:
7056     case Instruction::ZExt:
7057       return true;
7058     }
7059     return false;
7060   };
7061 
7062   if (!IsVectorizableOpcode(I->getOpcode()))
7063     return nullptr;
7064 
7065   // Success: widen this instruction.
7066   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7067 }
7068 
7069 VPBasicBlock *VPRecipeBuilder::handleReplication(
7070     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7071     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7072     VPlanPtr &Plan) {
7073   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7074       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7075       Range);
7076 
7077   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7078       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7079 
7080   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7081                                        IsUniform, IsPredicated);
7082   setRecipe(I, Recipe);
7083 
7084   // Find if I uses a predicated instruction. If so, it will use its scalar
7085   // value. Avoid hoisting the insert-element which packs the scalar value into
7086   // a vector value, as that happens iff all users use the vector value.
7087   for (auto &Op : I->operands())
7088     if (auto *PredInst = dyn_cast<Instruction>(Op))
7089       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7090         PredInst2Recipe[PredInst]->setAlsoPack(false);
7091 
7092   // Finalize the recipe for Instr, first if it is not predicated.
7093   if (!IsPredicated) {
7094     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7095     VPBB->appendRecipe(Recipe);
7096     return VPBB;
7097   }
7098   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7099   assert(VPBB->getSuccessors().empty() &&
7100          "VPBB has successors when handling predicated replication.");
7101   // Record predicated instructions for above packing optimizations.
7102   PredInst2Recipe[I] = Recipe;
7103   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7104   VPBlockUtils::insertBlockAfter(Region, VPBB);
7105   auto *RegSucc = new VPBasicBlock();
7106   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7107   return RegSucc;
7108 }
7109 
7110 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7111                                                       VPRecipeBase *PredRecipe,
7112                                                       VPlanPtr &Plan) {
7113   // Instructions marked for predication are replicated and placed under an
7114   // if-then construct to prevent side-effects.
7115 
7116   // Generate recipes to compute the block mask for this region.
7117   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7118 
7119   // Build the triangular if-then region.
7120   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7121   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7122   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7123   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7124   auto *PHIRecipe =
7125       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7126   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7127   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7128   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7129 
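  // The region forms a triangle: the entry block branches on the mask either
  // to the "if" block or directly to "continue", and "if" unconditionally
  // falls through to "continue".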
7130   // Note: first set Entry as region entry and then connect successors starting
7131   // from it in order, to propagate the "parent" of each VPBasicBlock.
7132   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7133   VPBlockUtils::connectBlocks(Pred, Exit);
7134 
7135   return Region;
7136 }
7137 
7138 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7139                                                       VFRange &Range,
7140                                                       VPlanPtr &Plan) {
7141   // First, check for specific widening recipes that deal with calls, memory
7142   // operations, inductions and Phi nodes.
7143   if (auto *CI = dyn_cast<CallInst>(Instr))
7144     return tryToWidenCall(CI, Range, *Plan);
7145 
7146   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7147     return tryToWidenMemory(Instr, Range, Plan);
7148 
7149   VPRecipeBase *Recipe;
7150   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7151     if (Phi->getParent() != OrigLoop->getHeader())
7152       return tryToBlend(Phi, Plan);
7153     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7154       return Recipe;
7155     return new VPWidenPHIRecipe(Phi);
7156   }
7157 
7158   if (isa<TruncInst>(Instr) &&
7159       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7160     return Recipe;
7161 
7162   if (!shouldWiden(Instr, Range))
7163     return nullptr;
7164 
7165   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7166     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7167                                 OrigLoop);
7168 
7169   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7170     bool InvariantCond =
7171         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7172     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7173                                    InvariantCond);
7174   }
7175 
7176   return tryToWiden(Instr, *Plan);
7177 }
7178 
7179 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7180                                                         unsigned MaxVF) {
7181   assert(OrigLoop->empty() && "Inner loop expected.");
7182 
7183   // Collect conditions feeding internal conditional branches; they need to be
7184   // represented in VPlan for it to model masking.
7185   SmallPtrSet<Value *, 1> NeedDef;
7186 
7187   auto *Latch = OrigLoop->getLoopLatch();
7188   for (BasicBlock *BB : OrigLoop->blocks()) {
7189     if (BB == Latch)
7190       continue;
7191     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7192     if (Branch && Branch->isConditional())
7193       NeedDef.insert(Branch->getCondition());
7194   }
7195 
7196   // If the tail is to be folded by masking, the primary induction variable,
7197   // if it exists, needs to be represented in VPlan to model early-exit masking.
7198   // Also, both the Phi and the live-out instruction of each reduction are
7199   // required in order to introduce a select between them in VPlan.
7200   if (CM.foldTailByMasking()) {
7201     if (Legal->getPrimaryInduction())
7202       NeedDef.insert(Legal->getPrimaryInduction());
7203     for (auto &Reduction : Legal->getReductionVars()) {
7204       NeedDef.insert(Reduction.first);
7205       NeedDef.insert(Reduction.second.getLoopExitInstr());
7206     }
7207   }
7208 
7209   // Collect instructions from the original loop that will become trivially dead
7210   // in the vectorized loop. We don't need to vectorize these instructions. For
7211   // example, original induction update instructions can become dead because we
7212   // separately emit induction "steps" when generating code for the new loop.
7213   // Similarly, we create a new latch condition when setting up the structure
7214   // of the new loop, so the old one can become dead.
7215   SmallPtrSet<Instruction *, 4> DeadInstructions;
7216   collectTriviallyDeadInstructions(DeadInstructions);
7217 
7218   // Add assume instructions we need to drop to DeadInstructions, to prevent
7219   // them from being added to the VPlan.
7220   // TODO: We only need to drop assumes in blocks that get flattened. If the
7221   // control flow is preserved, we should keep them.
7222   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7223   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7224 
7225   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7226   // Dead instructions do not need sinking. Remove them from SinkAfter.
7227   for (Instruction *I : DeadInstructions)
7228     SinkAfter.erase(I);
7229 
7230   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7231     VFRange SubRange = {VF, MaxVF + 1};
7232     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7233                                              DeadInstructions, SinkAfter));
7234     VF = SubRange.End;
7235   }
7236 }
7237 
7238 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7239     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7240     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7241     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7242 
7243   // Hold a mapping from predicated instructions to their recipes, in order to
7244   // fix their AlsoPack behavior if a user is determined to replicate and use a
7245   // scalar instead of vector value.
7246   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7247 
7248   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7249 
7250   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7251 
7252   // ---------------------------------------------------------------------------
7253   // Pre-construction: record ingredients whose recipes we'll need to further
7254   // process after constructing the initial VPlan.
7255   // ---------------------------------------------------------------------------
7256 
7257   // Mark instructions we'll need to sink later and their targets as
7258   // ingredients whose recipe we'll need to record.
7259   for (auto &Entry : SinkAfter) {
7260     RecipeBuilder.recordRecipeOf(Entry.first);
7261     RecipeBuilder.recordRecipeOf(Entry.second);
7262   }
7263 
7264   // For each interleave group which is relevant for this (possibly trimmed)
7265   // Range, add it to the set of groups to be later applied to the VPlan and add
7266   // placeholders for its members' Recipes which we'll be replacing with a
7267   // single VPInterleaveRecipe.
7268   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7269     auto applyIG = [IG, this](unsigned VF) -> bool {
7270       return (VF >= 2 && // Query is illegal for VF == 1
7271               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7272                   LoopVectorizationCostModel::CM_Interleave);
7273     };
7274     if (!getDecisionAndClampRange(applyIG, Range))
7275       continue;
7276     InterleaveGroups.insert(IG);
7277     for (unsigned i = 0; i < IG->getFactor(); i++)
7278       if (Instruction *Member = IG->getMember(i))
7279         RecipeBuilder.recordRecipeOf(Member);
7280   }
7281 
7282   // ---------------------------------------------------------------------------
7283   // Build initial VPlan: Scan the body of the loop in a topological order to
7284   // visit each basic block after having visited its predecessor basic blocks.
7285   // ---------------------------------------------------------------------------
7286 
7287   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7288   auto Plan = std::make_unique<VPlan>();
7289   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7290   Plan->setEntry(VPBB);
7291 
7292   // Represent values that will have defs inside VPlan.
7293   for (Value *V : NeedDef)
7294     Plan->addVPValue(V);
7295 
7296   // Scan the body of the loop in a topological order to visit each basic block
7297   // after having visited its predecessor basic blocks.
7298   LoopBlocksDFS DFS(OrigLoop);
7299   DFS.perform(LI);
7300 
7301   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7302     // Relevant instructions from basic block BB will be grouped into VPRecipe
7303     // ingredients and fill a new VPBasicBlock.
7304     unsigned VPBBsForBB = 0;
7305     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7306     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7307     VPBB = FirstVPBBForBB;
7308     Builder.setInsertPoint(VPBB);
7309 
7310     // Introduce each ingredient into VPlan.
7311     // TODO: Model and preserve debug intrinsics in VPlan.
7312     for (Instruction &I : BB->instructionsWithoutDebug()) {
7313       Instruction *Instr = &I;
7314 
7315       // First filter out irrelevant instructions, to ensure no recipes are
7316       // built for them.
7317       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7318         continue;
7319 
7320       if (auto Recipe =
7321               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7322         RecipeBuilder.setRecipe(Instr, Recipe);
7323         VPBB->appendRecipe(Recipe);
7324         continue;
7325       }
7326 
7327       // Otherwise, if all widening options failed, the instruction is to be
7328       // replicated. This may create a successor for VPBB.
7329       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7330           Instr, Range, VPBB, PredInst2Recipe, Plan);
7331       if (NextVPBB != VPBB) {
7332         VPBB = NextVPBB;
7333         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7334                                     : "");
7335       }
7336     }
7337   }
7338 
7339   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7340   // may also be empty, such as the last one (VPBB), reflecting original
7341   // basic blocks with no recipes.
7342   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7343   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7344   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7345   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7346   delete PreEntry;
7347 
7348   // ---------------------------------------------------------------------------
7349   // Transform initial VPlan: Apply previously taken decisions, in order, to
7350   // bring the VPlan to its final state.
7351   // ---------------------------------------------------------------------------
7352 
7353   // Apply Sink-After legal constraints.
7354   for (auto &Entry : SinkAfter) {
7355     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7356     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7357     Sink->moveAfter(Target);
7358   }
7359 
7360   // Interleave memory: for each Interleave Group we marked earlier as relevant
7361   // for this VPlan, replace the Recipes widening its memory instructions with a
7362   // single VPInterleaveRecipe at its insertion point.
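  // For example (illustrative), for an interleave group of two loads A[i] and
  // A[i+1] whose insertion point is the first load, the recipe widening that
  // load is replaced by one VPInterleaveRecipe and both member recipes are
  // erased.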
7363   for (auto IG : InterleaveGroups) {
7364     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7365         RecipeBuilder.getRecipe(IG->getInsertPos()));
7366     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7367         ->insertBefore(Recipe);
7368 
7369     for (unsigned i = 0; i < IG->getFactor(); ++i)
7370       if (Instruction *Member = IG->getMember(i)) {
7371         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7372       }
7373   }
7374 
7375   // Finally, if tail is folded by masking, introduce selects between the phi
7376   // and the live-out instruction of each reduction, at the end of the latch.
7377   if (CM.foldTailByMasking()) {
7378     Builder.setInsertPoint(VPBB);
7379     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7380     for (auto &Reduction : Legal->getReductionVars()) {
7381       VPValue *Phi = Plan->getVPValue(Reduction.first);
7382       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7383       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7384     }
7385   }
7386 
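  // Record the candidate VFs covered by this VPlan (powers of two within
  // Range) and give the plan a descriptive name for debug output, e.g.
  // "Initial VPlan for VF={4,8},UF>=1".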
7387   std::string PlanName;
7388   raw_string_ostream RSO(PlanName);
7389   unsigned VF = Range.Start;
7390   Plan->addVF(VF);
7391   RSO << "Initial VPlan for VF={" << VF;
7392   for (VF *= 2; VF < Range.End; VF *= 2) {
7393     Plan->addVF(VF);
7394     RSO << "," << VF;
7395   }
7396   RSO << "},UF>=1";
7397   RSO.flush();
7398   Plan->setName(PlanName);
7399 
7400   return Plan;
7401 }
7402 
7403 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7404   // Outer loop handling: outer loops may require CFG and instruction-level
7405   // transformations before even evaluating whether vectorization is profitable.
7406   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7407   // the vectorization pipeline.
7408   assert(!OrigLoop->empty());
7409   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7410 
7411   // Create new empty VPlan
7412   auto Plan = std::make_unique<VPlan>();
7413 
7414   // Build hierarchical CFG
7415   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7416   HCFGBuilder.buildHierarchicalCFG();
7417 
7418   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7419     Plan->addVF(VF);
7420 
7421   if (EnableVPlanPredication) {
7422     VPlanPredicator VPP(*Plan);
7423     VPP.predicate();
7424 
7425     // Avoid running transformation to recipes until masked code generation in
7426     // VPlan-native path is in place.
7427     return Plan;
7428   }
7429 
7430   SmallPtrSet<Instruction *, 1> DeadInstructions;
7431   VPlanTransforms::VPInstructionsToVPRecipes(
7432       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7433   return Plan;
7434 }
7435 
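// VPCallbackILV forwards VPlan's requests for widened (per-part) vector values
// and per-instance scalar values to the InnerLoopVectorizer, which maintains
// the underlying value maps during code generation.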
7436 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
7437     Value *V, unsigned Part) {
7438   return ILV.getOrCreateVectorValue(V, Part);
7439 }
7440 
7441 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7442     Value *V, const VPIteration &Instance) {
7443   return ILV.getOrCreateScalarValue(V, Instance);
7444 }
7445 
7446 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7447                                VPSlotTracker &SlotTracker) const {
7448   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7449   IG->getInsertPos()->printAsOperand(O, false);
7450   O << ", ";
7451   getAddr()->printAsOperand(O, SlotTracker);
7452   VPValue *Mask = getMask();
7453   if (Mask) {
7454     O << ", ";
7455     Mask->printAsOperand(O, SlotTracker);
7456   }
7457   for (unsigned i = 0; i < IG->getFactor(); ++i)
7458     if (Instruction *I = IG->getMember(i))
7459       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7460 }
7461 
7462 void VPWidenCallRecipe::execute(VPTransformState &State) {
7463   State.ILV->widenCallInstruction(Ingredient, User, State);
7464 }
7465 
7466 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7467   State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
7468 }
7469 
7470 void VPWidenRecipe::execute(VPTransformState &State) {
7471   State.ILV->widenInstruction(Ingredient, User, State);
7472 }
7473 
7474 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7475   State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
7476                       IsIndexLoopInvariant, State);
7477 }
7478 
7479 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7480   assert(!State.Instance && "Int or FP induction being replicated.");
7481   State.ILV->widenIntOrFpInduction(IV, Trunc);
7482 }
7483 
7484 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7485   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7486 }
7487 
7488 void VPBlendRecipe::execute(VPTransformState &State) {
7489   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7490   // We know that all PHIs in non-header blocks are converted into
7491   // selects, so we don't have to worry about the insertion order and we
7492   // can just use the builder.
7493   // At this point we generate the predication tree. There may be
7494   // duplications since this is a simple recursive scan, but future
7495   // optimizations will clean it up.
7496 
7497   unsigned NumIncoming = getNumIncomingValues();
7498 
7499   // Generate a sequence of selects of the form:
7500   // SELECT(Mask3, In3,
7501   //        SELECT(Mask2, In2,
7502   //               SELECT(Mask1, In1,
7503   //                      In0)))
7504   // Note that Mask0 is never used: lanes for which no path reaches this phi,
7505   // and which are essentially undef, are taken from In0.
7506   InnerLoopVectorizer::VectorParts Entry(State.UF);
7507   for (unsigned In = 0; In < NumIncoming; ++In) {
7508     for (unsigned Part = 0; Part < State.UF; ++Part) {
7509       // We might have single edge PHIs (blocks) - use an identity
7510       // 'select' for the first PHI operand.
7511       Value *In0 = State.get(getIncomingValue(In), Part);
7512       if (In == 0)
7513         Entry[Part] = In0; // Initialize with the first incoming value.
7514       else {
7515         // Select between the current value and the previous incoming edge
7516         // based on the incoming mask.
7517         Value *Cond = State.get(getMask(In), Part);
7518         Entry[Part] =
7519             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7520       }
7521     }
7522   }
7523   for (unsigned Part = 0; Part < State.UF; ++Part)
7524     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7525 }
7526 
7527 void VPInterleaveRecipe::execute(VPTransformState &State) {
7528   assert(!State.Instance && "Interleave group being replicated.");
7529   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7530 }
7531 
7532 void VPReplicateRecipe::execute(VPTransformState &State) {
7533   if (State.Instance) { // Generate a single instance.
7534     State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
7535                                     IsPredicated, State);
7536     // Insert scalar instance packing it into a vector.
7537     if (AlsoPack && State.VF > 1) {
7538       // If we're constructing lane 0, initialize to start from undef.
7539       if (State.Instance->Lane == 0) {
7540         Value *Undef = UndefValue::get(
7541             FixedVectorType::get(Ingredient->getType(), State.VF));
7542         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7543       }
7544       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7545     }
7546     return;
7547   }
7548 
7549   // Generate scalar instances for all VF lanes of all UF parts, unless the
7550   // instruction is uniform, in which case generate only the first lane for
7551   // each of the UF parts.
7552   unsigned EndLane = IsUniform ? 1 : State.VF;
7553   for (unsigned Part = 0; Part < State.UF; ++Part)
7554     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7555       State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
7556                                       IsPredicated, State);
7557 }
7558 
7559 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7560   assert(State.Instance && "Branch on Mask works only on single instance.");
7561 
7562   unsigned Part = State.Instance->Part;
7563   unsigned Lane = State.Instance->Lane;
7564 
7565   Value *ConditionBit = nullptr;
7566   VPValue *BlockInMask = getMask();
7567   if (BlockInMask) {
7568     ConditionBit = State.get(BlockInMask, Part);
7569     if (ConditionBit->getType()->isVectorTy())
7570       ConditionBit = State.Builder.CreateExtractElement(
7571           ConditionBit, State.Builder.getInt32(Lane));
7572   } else // Block in mask is all-one.
7573     ConditionBit = State.Builder.getTrue();
7574 
7575   // Replace the temporary unreachable terminator with a new conditional branch,
7576   // whose two destinations will be set later when they are created.
7577   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7578   assert(isa<UnreachableInst>(CurrentTerminator) &&
7579          "Expected to replace unreachable terminator with conditional branch.");
7580   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7581   CondBr->setSuccessor(0, nullptr);
7582   ReplaceInstWithInst(CurrentTerminator, CondBr);
7583 }
7584 
7585 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7586   assert(State.Instance && "Predicated instruction PHI works per instance.");
7587   Instruction *ScalarPredInst = cast<Instruction>(
7588       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7589   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7590   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7591   assert(PredicatingBB && "Predicated block has no single predecessor.");
7592 
7593   // By current pack/unpack logic we need to generate only a single phi node: if
7594   // a vector value for the predicated instruction exists at this point it means
7595   // the instruction has vector users only, and a phi for the vector value is
7596   // needed. In this case the recipe of the predicated instruction is marked to
7597   // also do that packing, thereby "hoisting" the insert-element sequence.
7598   // Otherwise, a phi node for the scalar value is needed.
7599   unsigned Part = State.Instance->Part;
7600   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7601     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7602     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7603     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7604     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7605     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7606     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7607   } else {
7608     Type *PredInstType = PredInst->getType();
7609     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7610     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7611     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7612     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7613   }
7614 }
7615 
7616 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7617   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7618   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7619                                         getMask());
7620 }
7621 
7622 // Determine how to lower the scalar epilogue, which depends on 1) optimizing
7623 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7624 // predication, and 4) a TTI hook that analyzes whether the loop is suitable
7625 // for predication.
7626 static ScalarEpilogueLowering getScalarEpilogueLowering(
7627     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7628     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7629     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7630     LoopVectorizationLegality &LVL) {
7631   bool OptSize =
7632       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7633                                                      PGSOQueryType::IRPass);
7634   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7635   // don't look at hints or options, and don't request a scalar epilogue.
7636   if (OptSize)
7637     return CM_ScalarEpilogueNotAllowedOptSize;
7638 
7639   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7640                               !PreferPredicateOverEpilog;
7641 
7642   // 2) Next, if disabling predication is requested on the command line, honour
7643   // this and request a scalar epilogue.
7644   if (PredicateOptDisabled)
7645     return CM_ScalarEpilogueAllowed;
7646 
7647   // 3) and 4) Look if enabling predication is requested on the command line,
7648   // with a loop hint, or if the TTI hook indicates this is profitable; if so,
7649   // request predication.
7650   if (PreferPredicateOverEpilog ||
7651       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7652       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7653                                         LVL.getLAI()) &&
7654        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7655     return CM_ScalarEpilogueNotNeededUsePredicate;
7656 
7657   return CM_ScalarEpilogueAllowed;
7658 }
7659 
7660 // Process the loop in the VPlan-native vectorization path. This path builds
7661 // VPlan upfront in the vectorization pipeline, which allows applying
7662 // VPlan-to-VPlan transformations from the very beginning without modifying the
7663 // input LLVM IR.
7664 static bool processLoopInVPlanNativePath(
7665     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7666     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7667     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7668     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7669     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7670 
7671   if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
7672     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
7673     return false;
7674   }
7675   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7676   Function *F = L->getHeader()->getParent();
7677   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7678 
7679   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7680       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7681 
7682   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7683                                 &Hints, IAI);
7684   // Use the planner for outer loop vectorization.
7685   // TODO: CM is not used at this point inside the planner. Turn CM into an
7686   // optional argument if we don't need it in the future.
7687   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7688 
7689   // Get user vectorization factor.
7690   const unsigned UserVF = Hints.getWidth();
7691 
7692   // Plan how to best vectorize, return the best VF and its cost.
7693   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7694 
7695   // If we are stress testing VPlan builds, do not attempt to generate vector
7696   // code. Masked vector code generation support will follow soon.
7697   // Also, do not attempt to vectorize if no vector code will be produced.
7698   if (VPlanBuildStressTest || EnableVPlanPredication ||
7699       VectorizationFactor::Disabled() == VF)
7700     return false;
7701 
7702   LVP.setBestPlan(VF.Width, 1);
7703 
7704   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7705                          &CM, BFI, PSI);
7706   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7707                     << L->getHeader()->getParent()->getName() << "\"\n");
7708   LVP.executePlan(LB, DT);
7709 
7710   // Mark the loop as already vectorized to avoid vectorizing again.
7711   Hints.setAlreadyVectorized();
7712 
7713   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
7714   return true;
7715 }
7716 
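// If loop interleaving or loop vectorization is globally disabled (via
// EnableLoopInterleaving / EnableLoopVectorization), restrict the
// corresponding transformation to loops where it is explicitly forced, in
// addition to whatever the pass options request.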
7717 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
7718     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
7719                                !EnableLoopInterleaving),
7720       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
7721                               !EnableLoopVectorization) {}
7722 
7723 bool LoopVectorizePass::processLoop(Loop *L) {
7724   assert((EnableVPlanNativePath || L->empty()) &&
7725          "VPlan-native path is not enabled. Only process inner loops.");
7726 
7727 #ifndef NDEBUG
7728   const std::string DebugLocStr = getDebugLocString(L);
7729 #endif /* NDEBUG */
7730 
7731   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7732                     << L->getHeader()->getParent()->getName() << "\" from "
7733                     << DebugLocStr << "\n");
7734 
7735   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7736 
7737   LLVM_DEBUG(
7738       dbgs() << "LV: Loop hints:"
7739              << " force="
7740              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7741                      ? "disabled"
7742                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7743                             ? "enabled"
7744                             : "?"))
7745              << " width=" << Hints.getWidth()
7746              << " unroll=" << Hints.getInterleave() << "\n");
7747 
7748   // Function containing loop
7749   Function *F = L->getHeader()->getParent();
7750 
7751   // Looking at the diagnostic output is the only way to determine if a loop
7752   // was vectorized (other than looking at the IR or machine code), so it
7753   // is important to generate an optimization remark for each loop. Most of
7754   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7755   // generated as OptimizationRemark and OptimizationRemarkMissed are less
7756   // verbose and report, respectively, vectorized loops and unvectorized loops
7757   // that may benefit from vectorization.
7758 
7759   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7760     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7761     return false;
7762   }
7763 
7764   PredicatedScalarEvolution PSE(*SE, *L);
7765 
7766   // Check if it is legal to vectorize the loop.
7767   LoopVectorizationRequirements Requirements(*ORE);
7768   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7769                                 &Requirements, &Hints, DB, AC, BFI, PSI);
7770   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7771     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7772     Hints.emitRemarkWithHints();
7773     return false;
7774   }
7775 
7776   // Check the function attributes and profiles to find out if this function
7777   // should be optimized for size.
7778   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7779       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7780 
7781   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7782   // here. They may require CFG and instruction level transformations before
7783   // even evaluating whether vectorization is profitable. Since we cannot modify
7784   // the incoming IR, we need to build VPlan upfront in the vectorization
7785   // pipeline.
7786   if (!L->empty())
7787     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7788                                         ORE, BFI, PSI, Hints);
7789 
7790   assert(L->empty() && "Inner loop expected.");
7791 
7792   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7793   // count by optimizing for size, to minimize overheads.
7794   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7795   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7796     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7797                       << "This loop is worth vectorizing only if no scalar "
7798                       << "iteration overheads are incurred.");
7799     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7800       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7801     else {
7802       LLVM_DEBUG(dbgs() << "\n");
7803       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7804     }
7805   }
7806 
7807   // Check the function attributes to see if implicit floats are allowed.
7808   // FIXME: This check doesn't seem right -- what if the loop is
7809   // an integer loop and the vector instructions selected are purely integer
7810   // vector instructions?
7811   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7812     reportVectorizationFailure(
7813         "Can't vectorize when the NoImplicitFloat attribute is used",
7814         "loop not vectorized due to NoImplicitFloat attribute",
7815         "NoImplicitFloat", ORE, L);
7816     Hints.emitRemarkWithHints();
7817     return false;
7818   }
7819 
7820   // Check if the target supports potentially unsafe FP vectorization.
7821   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7822   // for the target we're vectorizing for, to make sure none of the
7823   // additional fp-math flags can help.
7824   if (Hints.isPotentiallyUnsafe() &&
7825       TTI->isFPVectorizationPotentiallyUnsafe()) {
7826     reportVectorizationFailure(
7827         "Potentially unsafe FP op prevents vectorization",
7828         "loop not vectorized due to unsafe FP support.",
7829         "UnsafeFP", ORE, L);
7830     Hints.emitRemarkWithHints();
7831     return false;
7832   }
7833 
7834   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7835   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7836 
7837   // If an override option has been passed in for interleaved accesses, use it.
7838   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7839     UseInterleaved = EnableInterleavedMemAccesses;
7840 
7841   // Analyze interleaved memory accesses.
7842   if (UseInterleaved) {
7843     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7844   }
7845 
7846   // Use the cost model.
7847   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7848                                 F, &Hints, IAI);
7849   CM.collectValuesToIgnore();
7850 
7851   // Use the planner for vectorization.
7852   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
7853 
7854   // Get user vectorization factor and interleave count.
7855   unsigned UserVF = Hints.getWidth();
7856   unsigned UserIC = Hints.getInterleave();
7857 
7858   // Plan how to best vectorize, return the best VF and its cost.
7859   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
7860 
7861   VectorizationFactor VF = VectorizationFactor::Disabled();
7862   unsigned IC = 1;
7863 
7864   if (MaybeVF) {
7865     VF = *MaybeVF;
7866     // Select the interleave count.
7867     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7868   }
7869 
7870   // Identify the diagnostic messages that should be produced.
7871   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7872   bool VectorizeLoop = true, InterleaveLoop = true;
7873   if (Requirements.doesNotMeet(F, L, Hints)) {
7874     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7875                          "requirements.\n");
7876     Hints.emitRemarkWithHints();
7877     return false;
7878   }
7879 
7880   if (VF.Width == 1) {
7881     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7882     VecDiagMsg = std::make_pair(
7883         "VectorizationNotBeneficial",
7884         "the cost-model indicates that vectorization is not beneficial");
7885     VectorizeLoop = false;
7886   }
7887 
7888   if (!MaybeVF && UserIC > 1) {
7889     // Tell the user interleaving was avoided up-front, despite being explicitly
7890     // requested.
7891     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7892                          "interleaving should be avoided up front\n");
7893     IntDiagMsg = std::make_pair(
7894         "InterleavingAvoided",
7895         "Ignoring UserIC, because interleaving was avoided up front");
7896     InterleaveLoop = false;
7897   } else if (IC == 1 && UserIC <= 1) {
7898     // Tell the user interleaving is not beneficial.
7899     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7900     IntDiagMsg = std::make_pair(
7901         "InterleavingNotBeneficial",
7902         "the cost-model indicates that interleaving is not beneficial");
7903     InterleaveLoop = false;
7904     if (UserIC == 1) {
7905       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7906       IntDiagMsg.second +=
7907           " and is explicitly disabled or interleave count is set to 1";
7908     }
7909   } else if (IC > 1 && UserIC == 1) {
7910     // Tell the user interleaving is beneficial but explicitly disabled.
7911     LLVM_DEBUG(
7912         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7913     IntDiagMsg = std::make_pair(
7914         "InterleavingBeneficialButDisabled",
7915         "the cost-model indicates that interleaving is beneficial "
7916         "but is explicitly disabled or interleave count is set to 1");
7917     InterleaveLoop = false;
7918   }
7919 
7920   // Override IC if user provided an interleave count.
7921   IC = UserIC > 0 ? UserIC : IC;
7922 
7923   // Emit diagnostic messages, if any.
7924   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7925   if (!VectorizeLoop && !InterleaveLoop) {
7926     // Do not vectorize or interleave the loop.
7927     ORE->emit([&]() {
7928       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7929                                       L->getStartLoc(), L->getHeader())
7930              << VecDiagMsg.second;
7931     });
7932     ORE->emit([&]() {
7933       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7934                                       L->getStartLoc(), L->getHeader())
7935              << IntDiagMsg.second;
7936     });
7937     return false;
7938   } else if (!VectorizeLoop && InterleaveLoop) {
7939     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7940     ORE->emit([&]() {
7941       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7942                                         L->getStartLoc(), L->getHeader())
7943              << VecDiagMsg.second;
7944     });
7945   } else if (VectorizeLoop && !InterleaveLoop) {
7946     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7947                       << ") in " << DebugLocStr << '\n');
7948     ORE->emit([&]() {
7949       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7950                                         L->getStartLoc(), L->getHeader())
7951              << IntDiagMsg.second;
7952     });
7953   } else if (VectorizeLoop && InterleaveLoop) {
7954     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7955                       << ") in " << DebugLocStr << '\n');
7956     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7957   }
7958 
7959   LVP.setBestPlan(VF.Width, IC);
7960 
7961   using namespace ore;
7962   bool DisableRuntimeUnroll = false;
7963   MDNode *OrigLoopID = L->getLoopID();
7964 
7965   if (!VectorizeLoop) {
7966     assert(IC > 1 && "interleave count should not be 1 or 0");
7967     // If we decided that it is not legal to vectorize the loop, then
7968     // interleave it.
7969     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
7970                                BFI, PSI);
7971     LVP.executePlan(Unroller, DT);
7972 
7973     ORE->emit([&]() {
7974       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7975                                 L->getHeader())
7976              << "interleaved loop (interleaved count: "
7977              << NV("InterleaveCount", IC) << ")";
7978     });
7979   } else {
7980     // If we decided that it is *legal* to vectorize the loop, then do it.
7981     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7982                            &LVL, &CM, BFI, PSI);
7983     LVP.executePlan(LB, DT);
7984     ++LoopsVectorized;
7985 
7986     // Add metadata to disable runtime unrolling a scalar loop when there are
7987     // no runtime checks about strides and memory. A scalar loop that is
7988     // rarely used is not worth unrolling.
7989     if (!LB.areSafetyChecksAdded())
7990       DisableRuntimeUnroll = true;
7991 
7992     // Report the vectorization decision.
7993     ORE->emit([&]() {
7994       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7995                                 L->getHeader())
7996              << "vectorized loop (vectorization width: "
7997              << NV("VectorizationFactor", VF.Width)
7998              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7999     });
8000   }
8001 
8002   Optional<MDNode *> RemainderLoopID =
8003       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
8004                                       LLVMLoopVectorizeFollowupEpilogue});
8005   if (RemainderLoopID.hasValue()) {
8006     L->setLoopID(RemainderLoopID.getValue());
8007   } else {
8008     if (DisableRuntimeUnroll)
8009       AddRuntimeUnrollDisableMetaData(L);
8010 
8011     // Mark the loop as already vectorized to avoid vectorizing again.
8012     Hints.setAlreadyVectorized();
8013   }
8014 
8015   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8016   return true;
8017 }
8018 
8019 LoopVectorizeResult LoopVectorizePass::runImpl(
8020     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8021     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8022     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
8023     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8024     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
8025   SE = &SE_;
8026   LI = &LI_;
8027   TTI = &TTI_;
8028   DT = &DT_;
8029   BFI = &BFI_;
8030   TLI = TLI_;
8031   AA = &AA_;
8032   AC = &AC_;
8033   GetLAA = &GetLAA_;
8034   DB = &DB_;
8035   ORE = &ORE_;
8036   PSI = PSI_;
8037 
8038   // Don't attempt if
8039   // 1. the target claims to have no vector registers, and
8040   // 2. interleaving won't help ILP.
8041   //
8042   // The second condition is necessary because, even if the target has no
8043   // vector registers, loop vectorization may still enable scalar
8044   // interleaving.
8045   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8046       TTI->getMaxInterleaveFactor(1) < 2)
8047     return LoopVectorizeResult(false, false);
8048 
8049   bool Changed = false, CFGChanged = false;
8050 
8051   // The vectorizer requires loops to be in simplified form.
8052   // Since simplification may add new inner loops, it has to run before the
8053   // legality and profitability checks. This means running the loop vectorizer
8054   // will simplify all loops, regardless of whether anything ends up being
8055   // vectorized.
8056   for (auto &L : *LI)
8057     Changed |= CFGChanged |=
8058         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8059 
8060   // Build up a worklist of inner-loops to vectorize. This is necessary as
8061   // the act of vectorizing or partially unrolling a loop creates new loops
8062   // and can invalidate iterators across the loops.
8063   SmallVector<Loop *, 8> Worklist;
8064 
8065   for (Loop *L : *LI)
8066     collectSupportedLoops(*L, LI, ORE, Worklist);
8067 
8068   LoopsAnalyzed += Worklist.size();
8069 
8070   // Now walk the identified inner loops.
8071   while (!Worklist.empty()) {
8072     Loop *L = Worklist.pop_back_val();
8073 
8074     // For the inner loops we actually process, form LCSSA to simplify the
8075     // transform.
8076     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8077 
8078     Changed |= CFGChanged |= processLoop(L);
8079   }
8080 
8081   // Process each loop nest in the function.
8082   return LoopVectorizeResult(Changed, CFGChanged);
8083 }
8084 
8085 PreservedAnalyses LoopVectorizePass::run(Function &F,
8086                                          FunctionAnalysisManager &AM) {
8087   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
8088   auto &LI = AM.getResult<LoopAnalysis>(F);
8089   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
8090   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
8091   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
8092   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
8093   auto &AA = AM.getResult<AAManager>(F);
8094   auto &AC = AM.getResult<AssumptionAnalysis>(F);
8095   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
8096   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
8097   MemorySSA *MSSA = EnableMSSALoopDependency
8098                         ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
8099                         : nullptr;
8100 
8101   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
8102   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
8103       [&](Loop &L) -> const LoopAccessInfo & {
8104     LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
8105     return LAM.getResult<LoopAccessAnalysis>(L, AR);
8106   };
8107   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
8108   ProfileSummaryInfo *PSI =
8109       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
8110   LoopVectorizeResult Result =
8111       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
8112   if (!Result.MadeAnyChange)
8113     return PreservedAnalyses::all();
8114   PreservedAnalyses PA;
8115 
8116   // We currently do not preserve loopinfo/dominator analyses with outer loop
8117   // vectorization. Until this is addressed, mark these analyses as preserved
8118   // only for non-VPlan-native path.
8119   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
8120   if (!EnableVPlanNativePath) {
8121     PA.preserve<LoopAnalysis>();
8122     PA.preserve<DominatorTreeAnalysis>();
8123   }
8124   PA.preserve<BasicAA>();
8125   PA.preserve<GlobalsAA>();
8126   if (!Result.MadeCFGChange)
8127     PA.preserveSet<CFGAnalyses>();
8128   return PA;
8129 }
8130