1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
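//
// For illustration only (the exact shape depends on the target and the chosen
// vector width), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// is conceptually rewritten so that, for a vector width of four, each
// iteration of the widened loop operates on a[i..i+3] and b[i..i+3] at once
// and 'i' advances by four; any left-over iterations are handled by a scalar
// epilogue loop.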
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Indicates that an epilogue is undesired and predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
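// For illustration, with a vector width of 4 and a trip count of 10, folding
// the tail means the vector loop runs three masked iterations, the last of
// which enables only its first two lanes, instead of running two full vector
// iterations plus a separate two-iteration scalar epilogue (illustrative
// numbers).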
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
192              "will be determined by the smallest type in loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with an estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
281 // VPlan-native vectorization path. It must be used in conjunction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the loop vectorization passes"));
297 
298 /// A helper function that returns the type of loaded or stored value.
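/// For example, for the IR "%v = load i32, i32* %p" this returns i32, and for
/// "store i32 %x, i32* %p" it returns the type of the stored value, also i32
/// (illustrative snippets).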
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
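/// For example (the exact sizes depend on the DataLayout), i1 has an alloc
/// size of one byte while <4 x i1> has a store size of one byte, so for VF = 4
/// we get 4 * 1 != 1 and the type is treated as irregular.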
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = VectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
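/// For example, with the current return value of 2, an instruction of cost C
/// that sits in a predicated block is (roughly) charged C / 2 when
/// accumulating the per-iteration cost of the loop.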
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
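/// For example (illustrative), for an i32 type and C = -1 this returns the
/// ConstantInt -1, and for a float type it returns the ConstantFP -1.0.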
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors, or multiple
379 /// scalars. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
388 /// LoopVectorizationLegality class to provide information about the induction
389 /// and reduction variables that were found, for a given vectorization factor.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM)
399       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
400         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
401         Builder(PSE.getSE()->getContext()),
402         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
403   virtual ~InnerLoopVectorizer() = default;
404 
405   /// Create a new empty loop. Unlink the old loop and connect the new one.
406   /// Return the pre-header block of the new loop.
407   BasicBlock *createVectorizedLoopSkeleton();
408 
409   /// Widen a single instruction within the innermost loop.
410   void widenInstruction(Instruction &I, VPUser &Operands,
411                         VPTransformState &State);
412 
413   /// Widen a single call instruction within the innermost loop.
414   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
415                             VPTransformState &State);
416 
417   /// Widen a single select instruction within the innermost loop.
418   void widenSelectInstruction(SelectInst &I, bool InvariantCond);
419 
420   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
421   void fixVectorizedLoop();
422 
423   // Return true if any runtime check is added.
424   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
425 
426   /// A type for vectorized values in the new loop. Each value from the
427   /// original loop, when vectorized, is represented by UF vector values in the
428   /// new unrolled loop, where UF is the unroll factor.
429   using VectorParts = SmallVector<Value *, 2>;
430 
431   /// Vectorize a single GetElementPtrInst based on information gathered and
432   /// decisions taken during planning.
433   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
434                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
435 
436   /// Vectorize a single PHINode in a block. This method handles the induction
437   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
438   /// arbitrary length vectors.
439   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
440 
441   /// A helper function to scalarize a single Instruction in the innermost loop.
442   /// Generates a sequence of scalar instances for each lane between \p MinLane
443   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
444   /// inclusive.
445   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
446                             bool IfPredicateInstr);
447 
448   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
449   /// is provided, the integer induction variable will first be truncated to
450   /// the corresponding type.
451   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
452 
453   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
454   /// vector or scalar value on-demand if one is not yet available. When
455   /// vectorizing a loop, we visit the definition of an instruction before its
456   /// uses. When visiting the definition, we either vectorize or scalarize the
457   /// instruction, creating an entry for it in the corresponding map. (In some
458   /// cases, such as induction variables, we will create both vector and scalar
459   /// entries.) Then, as we encounter uses of the definition, we derive values
460   /// for each scalar or vector use unless such a value is already available.
461   /// For example, if we scalarize a definition and one of its uses is vector,
462   /// we build the required vector on-demand with an insertelement sequence
463   /// when visiting the use. Otherwise, if the use is scalar, we can use the
464   /// existing scalar definition.
465   ///
466   /// Return a value in the new loop corresponding to \p V from the original
467   /// loop at unroll index \p Part. If the value has already been vectorized,
468   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
469   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
470   /// a new vector value on-demand by inserting the scalar values into a vector
471   /// with an insertelement sequence. If the value has been neither vectorized
472   /// nor scalarized, it must be loop invariant, so we simply broadcast the
473   /// value into a vector.
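  ///
  /// For illustration, if a definition was scalarized into lanes %s0..%s3 for
  /// VF = 4, a vector use at unroll index \p Part is satisfied by assembling
  /// <%s0, %s1, %s2, %s3> with an insertelement sequence (names illustrative).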
474   Value *getOrCreateVectorValue(Value *V, unsigned Part);
475 
476   /// Return a value in the new loop corresponding to \p V from the original
477   /// loop at unroll and vector indices \p Instance. If the value has been
478   /// vectorized but not scalarized, the necessary extractelement instruction
479   /// will be generated.
480   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
481 
482   /// Construct the vector value of a scalarized value \p V one lane at a time.
483   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
484 
485   /// Try to vectorize interleaved access group \p Group with the base address
486   /// given in \p Addr, optionally masking the vector operations if \p
487   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
488   /// values in the vectorized loop.
489   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
490                                 VPTransformState &State, VPValue *Addr,
491                                 VPValue *BlockInMask = nullptr);
492 
493   /// Vectorize Load and Store instructions with the base address given in \p
494   /// Addr, optionally masking the vector operations if \p BlockInMask is
495   /// non-null. Use \p State to translate given VPValues to IR values in the
496   /// vectorized loop.
497   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
498                                   VPValue *Addr, VPValue *StoredValue,
499                                   VPValue *BlockInMask);
500 
501   /// Set the debug location in the builder using the debug location in
502   /// the instruction.
503   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
504 
505   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
506   void fixNonInductionPHIs(void);
507 
508 protected:
509   friend class LoopVectorizationPlanner;
510 
511   /// A small list of PHINodes.
512   using PhiVector = SmallVector<PHINode *, 4>;
513 
514   /// A type for scalarized values in the new loop. Each value from the
515   /// original loop, when scalarized, is represented by UF x VF scalar values
516   /// in the new unrolled loop, where UF is the unroll factor and VF is the
517   /// vectorization factor.
518   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
519 
520   /// Set up the values of the IVs correctly when exiting the vector loop.
521   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
522                     Value *CountRoundDown, Value *EndValue,
523                     BasicBlock *MiddleBlock);
524 
525   /// Create a new induction variable inside L.
526   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
527                                    Value *Step, Instruction *DL);
528 
529   /// Handle all cross-iteration phis in the header.
530   void fixCrossIterationPHIs();
531 
532   /// Fix a first-order recurrence. This is the second phase of vectorizing
533   /// this phi node.
534   void fixFirstOrderRecurrence(PHINode *Phi);
535 
536   /// Fix a reduction cross-iteration phi. This is the second phase of
537   /// vectorizing this phi node.
538   void fixReduction(PHINode *Phi);
539 
540   /// Clear NSW/NUW flags from reduction instructions if necessary.
541   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
542 
543   /// The loop exit block may have single-value PHI nodes with some
544   /// incoming value. While vectorizing, we only handled real values
545   /// that were defined inside the loop, and we should have one value for
546   /// each predecessor of its parent basic block. See PR14725.
547   void fixLCSSAPHIs();
548 
549   /// Iteratively sink the scalarized operands of a predicated instruction into
550   /// the block that was created for it.
551   void sinkScalarOperands(Instruction *PredInst);
552 
553   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
554   /// represented as.
555   void truncateToMinimalBitwidths();
556 
557   /// Create a broadcast instruction. This method generates a broadcast
558   /// instruction (shuffle) for loop invariant values and for the induction
559   /// value. If this is the induction variable then we extend it to N, N+1, ...
560   /// this is needed because each iteration in the loop corresponds to a SIMD
561   /// element.
562   virtual Value *getBroadcastInstrs(Value *V);
563 
564   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
565   /// to each vector element of Val. The sequence starts at StartIdx.
566   /// \p Opcode is relevant for FP induction variable.
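  /// For illustration, with Val = <2, 2, 2, 2>, StartIdx = 0 and Step = 1 this
  /// produces <2, 3, 4, 5>, and with StartIdx = 4 it produces <6, 7, 8, 9>
  /// (assuming an integer add).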
567   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
568                                Instruction::BinaryOps Opcode =
569                                Instruction::BinaryOpsEnd);
570 
571   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
572   /// variable on which to base the steps, \p Step is the size of the step, and
573   /// \p EntryVal is the value from the original loop that maps to the steps.
574   /// Note that \p EntryVal doesn't have to be an induction variable - it
575   /// can also be a truncate instruction.
576   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
577                         const InductionDescriptor &ID);
578 
579   /// Create a vector induction phi node based on an existing scalar one. \p
580   /// EntryVal is the value from the original loop that maps to the vector phi
581   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
582   /// truncate instruction, instead of widening the original IV, we widen a
583   /// version of the IV truncated to \p EntryVal's type.
584   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
585                                        Value *Step, Instruction *EntryVal);
586 
587   /// Returns true if an instruction \p I should be scalarized instead of
588   /// vectorized for the chosen vectorization factor.
589   bool shouldScalarizeInstruction(Instruction *I) const;
590 
591   /// Returns true if we should generate a scalar version of \p IV.
592   bool needsScalarInduction(Instruction *IV) const;
593 
594   /// If there is a cast involved in the induction variable \p ID, which should
595   /// be ignored in the vectorized loop body, this function records the
596   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
597   /// cast. We had already proved that the casted Phi is equal to the uncasted
598   /// Phi in the vectorized loop (under a runtime guard), and therefore
599   /// there is no need to vectorize the cast - the same value can be used in the
600   /// vector loop for both the Phi and the cast.
601   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
602   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
603   ///
604   /// \p EntryVal is the value from the original loop that maps to the vector
605   /// phi node and is used to distinguish what is the IV currently being
606   /// processed - original one (if \p EntryVal is a phi corresponding to the
607   /// original IV) or the "newly-created" one based on the proof mentioned above
608   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
609   /// latter case \p EntryVal is a TruncInst and we must not record anything for
610   /// that IV, but it's error-prone to expect callers of this routine to care
611   /// about that, hence this explicit parameter.
612   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
613                                              const Instruction *EntryVal,
614                                              Value *VectorLoopValue,
615                                              unsigned Part,
616                                              unsigned Lane = UINT_MAX);
617 
618   /// Generate a shuffle sequence that will reverse the vector Vec.
619   virtual Value *reverseVector(Value *Vec);
620 
621   /// Returns (and creates if needed) the original loop trip count.
622   Value *getOrCreateTripCount(Loop *NewLoop);
623 
624   /// Returns (and creates if needed) the trip count of the widened loop.
625   Value *getOrCreateVectorTripCount(Loop *NewLoop);
626 
627   /// Returns a bitcasted value to the requested vector type.
628   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
629   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
630                                 const DataLayout &DL);
631 
632   /// Emit a bypass check to see if the vector trip count is zero, including if
633   /// it overflows.
634   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
635 
636   /// Emit a bypass check to see if all of the SCEV assumptions we've
637   /// had to make are correct.
638   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
639 
640   /// Emit bypass checks to check any memory assumptions we may have made.
641   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
642 
643   /// Compute the transformed value of Index at offset StartValue using step
644   /// StepValue.
645   /// For integer induction, returns StartValue + Index * StepValue.
646   /// For pointer induction, returns StartValue[Index * StepValue].
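  /// For example (illustrative), with StartValue = 100, StepValue = 3 and
  /// Index = i, an integer induction yields 100 + 3 * i, while a pointer
  /// induction yields the address of StartValue[3 * i].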
647   /// FIXME: The newly created binary instructions should contain nsw/nuw
648   /// flags, which can be found from the original scalar operations.
649   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
650                               const DataLayout &DL,
651                               const InductionDescriptor &ID) const;
652 
653   /// Add additional metadata to \p To that was not present on \p Orig.
654   ///
655   /// Currently this is used to add the noalias annotations based on the
656   /// inserted memchecks.  Use this for instructions that are *cloned* into the
657   /// vector loop.
658   void addNewMetadata(Instruction *To, const Instruction *Orig);
659 
660   /// Add metadata from one instruction to another.
661   ///
662   /// This includes both the original MDs from \p From and additional ones (\see
663   /// addNewMetadata).  Use this for *newly created* instructions in the vector
664   /// loop.
665   void addMetadata(Instruction *To, Instruction *From);
666 
667   /// Similar to the previous function but it adds the metadata to a
668   /// vector of instructions.
669   void addMetadata(ArrayRef<Value *> To, Instruction *From);
670 
671   /// The original loop.
672   Loop *OrigLoop;
673 
674   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
675   /// dynamic knowledge to simplify SCEV expressions and converts them to a
676   /// more usable form.
677   PredicatedScalarEvolution &PSE;
678 
679   /// Loop Info.
680   LoopInfo *LI;
681 
682   /// Dominator Tree.
683   DominatorTree *DT;
684 
685   /// Alias Analysis.
686   AliasAnalysis *AA;
687 
688   /// Target Library Info.
689   const TargetLibraryInfo *TLI;
690 
691   /// Target Transform Info.
692   const TargetTransformInfo *TTI;
693 
694   /// Assumption Cache.
695   AssumptionCache *AC;
696 
697   /// Interface to emit optimization remarks.
698   OptimizationRemarkEmitter *ORE;
699 
700   /// LoopVersioning.  It's only set up (non-null) if memchecks were
701   /// used.
702   ///
703   /// This is currently only used to add no-alias metadata based on the
704   /// memchecks.  The actual versioning is performed manually.
705   std::unique_ptr<LoopVersioning> LVer;
706 
707   /// The vectorization SIMD factor to use. Each vector will have this many
708   /// vector elements.
709   unsigned VF;
710 
711   /// The vectorization unroll factor to use. Each scalar is vectorized to this
712   /// many different vector instructions.
713   unsigned UF;
714 
715   /// The builder that we use
716   IRBuilder<> Builder;
717 
718   // --- Vectorization state ---
719 
720   /// The vector-loop preheader.
721   BasicBlock *LoopVectorPreHeader;
722 
723   /// The scalar-loop preheader.
724   BasicBlock *LoopScalarPreHeader;
725 
726   /// Middle Block between the vector and the scalar loop.
727   BasicBlock *LoopMiddleBlock;
728 
729   /// The ExitBlock of the scalar loop.
730   BasicBlock *LoopExitBlock;
731 
732   /// The vector loop body.
733   BasicBlock *LoopVectorBody;
734 
735   /// The scalar loop body.
736   BasicBlock *LoopScalarBody;
737 
738   /// A list of all bypass blocks. The first block is the entry of the loop.
739   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
740 
741   /// The new Induction variable which was added to the new block.
742   PHINode *Induction = nullptr;
743 
744   /// The induction variable of the old basic block.
745   PHINode *OldInduction = nullptr;
746 
747   /// Maps values from the original loop to their corresponding values in the
748   /// vectorized loop. A key value can map to either vector values, scalar
749   /// values or both kinds of values, depending on whether the key was
750   /// vectorized and scalarized.
751   VectorizerValueMap VectorLoopValueMap;
752 
753   /// Store instructions that were predicated.
754   SmallVector<Instruction *, 4> PredicatedInstructions;
755 
756   /// Trip count of the original loop.
757   Value *TripCount = nullptr;
758 
759   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
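  /// For example, with an original trip count of 100, VF = 4 and UF = 2 this
  /// is 100 - (100 % 8) = 96, leaving 4 iterations for the scalar loop
  /// (illustrative values).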
760   Value *VectorTripCount = nullptr;
761 
762   /// The legality analysis.
763   LoopVectorizationLegality *Legal;
764 
765   /// The profitability analysis.
766   LoopVectorizationCostModel *Cost;
767 
768   // Record whether runtime checks are added.
769   bool AddedSafetyChecks = false;
770 
771   // Holds the end values for each induction variable. We save the end values
772   // so we can later fix-up the external users of the induction variables.
773   DenseMap<PHINode *, Value *> IVEndValues;
774 
775   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
776   // fixed up at the end of vector code generation.
777   SmallVector<PHINode *, 8> OrigPHIsToFix;
778 };
779 
780 class InnerLoopUnroller : public InnerLoopVectorizer {
781 public:
782   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
783                     LoopInfo *LI, DominatorTree *DT,
784                     const TargetLibraryInfo *TLI,
785                     const TargetTransformInfo *TTI, AssumptionCache *AC,
786                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
787                     LoopVectorizationLegality *LVL,
788                     LoopVectorizationCostModel *CM)
789       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
790                             UnrollFactor, LVL, CM) {}
791 
792 private:
793   Value *getBroadcastInstrs(Value *V) override;
794   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
795                        Instruction::BinaryOps Opcode =
796                        Instruction::BinaryOpsEnd) override;
797   Value *reverseVector(Value *Vec) override;
798 };
799 
800 } // end namespace llvm
801 
802 /// Look for a meaningful debug location on the instruction or its
803 /// operands.
804 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
805   if (!I)
806     return I;
807 
808   DebugLoc Empty;
809   if (I->getDebugLoc() != Empty)
810     return I;
811 
812   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
813     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
814       if (OpInst->getDebugLoc() != Empty)
815         return OpInst;
816   }
817 
818   return I;
819 }
820 
821 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
822   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
823     const DILocation *DIL = Inst->getDebugLoc();
824     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
825         !isa<DbgInfoIntrinsic>(Inst)) {
826       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
827       if (NewDIL)
828         B.SetCurrentDebugLocation(NewDIL.getValue());
829       else
830         LLVM_DEBUG(dbgs()
831                    << "Failed to create new discriminator: "
832                    << DIL->getFilename() << " Line: " << DIL->getLine());
833     }
834     else
835       B.SetCurrentDebugLocation(DIL);
836   } else
837     B.SetCurrentDebugLocation(DebugLoc());
838 }
839 
840 /// Write a record \p DebugMsg about vectorization failure to the debug
841 /// output stream. If \p I is passed, it is an instruction that prevents
842 /// vectorization.
843 #ifndef NDEBUG
844 static void debugVectorizationFailure(const StringRef DebugMsg,
845     Instruction *I) {
846   dbgs() << "LV: Not vectorizing: " << DebugMsg;
847   if (I != nullptr)
848     dbgs() << " " << *I;
849   else
850     dbgs() << '.';
851   dbgs() << '\n';
852 }
853 #endif
854 
855 /// Create an analysis remark that explains why vectorization failed
856 ///
857 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
858 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
859 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
860 /// the location of the remark.  \return the remark object that can be
861 /// streamed to.
862 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
863     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
864   Value *CodeRegion = TheLoop->getHeader();
865   DebugLoc DL = TheLoop->getStartLoc();
866 
867   if (I) {
868     CodeRegion = I->getParent();
869     // If there is no debug location attached to the instruction, fall back to
870     // using the loop's location.
871     if (I->getDebugLoc())
872       DL = I->getDebugLoc();
873   }
874 
875   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
876   R << "loop not vectorized: ";
877   return R;
878 }
879 
880 namespace llvm {
881 
882 void reportVectorizationFailure(const StringRef DebugMsg,
883     const StringRef OREMsg, const StringRef ORETag,
884     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
885   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
886   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
887   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
888                 ORETag, TheLoop, I) << OREMsg);
889 }
890 
891 } // end namespace llvm
892 
893 #ifndef NDEBUG
894 /// \return string containing a file name and a line # for the given loop.
895 static std::string getDebugLocString(const Loop *L) {
896   std::string Result;
897   if (L) {
898     raw_string_ostream OS(Result);
899     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
900       LoopDbgLoc.print(OS);
901     else
902       // Just print the module name.
903       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
904     OS.flush();
905   }
906   return Result;
907 }
908 #endif
909 
910 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
911                                          const Instruction *Orig) {
912   // If the loop was versioned with memchecks, add the corresponding no-alias
913   // metadata.
914   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
915     LVer->annotateInstWithNoAlias(To, Orig);
916 }
917 
918 void InnerLoopVectorizer::addMetadata(Instruction *To,
919                                       Instruction *From) {
920   propagateMetadata(To, From);
921   addNewMetadata(To, From);
922 }
923 
924 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
925                                       Instruction *From) {
926   for (Value *V : To) {
927     if (Instruction *I = dyn_cast<Instruction>(V))
928       addMetadata(I, From);
929   }
930 }
931 
932 namespace llvm {
933 
934 // Hints for the loop-vectorization cost model on how the scalar epilogue loop
935 // should be lowered.
936 enum ScalarEpilogueLowering {
937 
938   // The default: allowing scalar epilogues.
939   CM_ScalarEpilogueAllowed,
940 
941   // Vectorization with OptForSize: don't allow epilogues.
942   CM_ScalarEpilogueNotAllowedOptSize,
943 
944   // A special case of vectorization with OptForSize: loops with a very small
945   // trip count are considered for vectorization under OptForSize, thereby
946   // making sure the cost of their loop body is dominant, free of runtime
947   // guards and scalar iteration overheads.
948   CM_ScalarEpilogueNotAllowedLowTripLoop,
949 
950   // Loop hint predicate indicating an epilogue is undesired.
951   CM_ScalarEpilogueNotNeededUsePredicate
952 };
953 
954 /// LoopVectorizationCostModel - estimates the expected speedups due to
955 /// vectorization.
956 /// In many cases vectorization is not profitable. This can happen because of
957 /// a number of reasons. In this class we mainly attempt to predict the
958 /// expected speedup/slowdowns due to the supported instruction set. We use the
959 /// TargetTransformInfo to query the different backends for the cost of
960 /// different operations.
961 class LoopVectorizationCostModel {
962 public:
963   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
964                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
965                              LoopVectorizationLegality *Legal,
966                              const TargetTransformInfo &TTI,
967                              const TargetLibraryInfo *TLI, DemandedBits *DB,
968                              AssumptionCache *AC,
969                              OptimizationRemarkEmitter *ORE, const Function *F,
970                              const LoopVectorizeHints *Hints,
971                              InterleavedAccessInfo &IAI)
972       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
973         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
974         Hints(Hints), InterleaveInfo(IAI) {}
975 
976   /// \return An upper bound for the vectorization factor, or None if
977   /// vectorization and interleaving should be avoided up front.
978   Optional<unsigned> computeMaxVF();
979 
980   /// \return True if runtime checks are required for vectorization, and false
981   /// otherwise.
982   bool runtimeChecksRequired();
983 
984   /// \return The most profitable vectorization factor and the cost of that VF.
985   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
986   /// then this vectorization factor will be selected if vectorization is
987   /// possible.
988   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
989 
990   /// Setup cost-based decisions for user vectorization factor.
991   void selectUserVectorizationFactor(unsigned UserVF) {
992     collectUniformsAndScalars(UserVF);
993     collectInstsToScalarize(UserVF);
994   }
995 
996   /// \return The size (in bits) of the smallest and widest types in the code
997   /// that needs to be vectorized. We ignore values that remain scalar such as
998   /// 64 bit loop indices.
999   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1000 
1001   /// \return The desired interleave count.
1002   /// If interleave count has been specified by metadata it will be returned.
1003   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1004   /// are the selected vectorization factor and the cost of the selected VF.
1005   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1006 
1007   /// Memory access instructions may be vectorized in more than one way.
1008   /// The form an instruction takes after vectorization depends on its cost.
1009   /// This function takes cost-based decisions for Load/Store instructions
1010   /// and collects them in a map. This decision map is used for building
1011   /// the lists of loop-uniform and loop-scalar instructions.
1012   /// The calculated cost is saved with the widening decision in order to
1013   /// avoid redundant calculations.
1014   void setCostBasedWideningDecision(unsigned VF);
1015 
1016   /// A struct that represents some properties of the register usage
1017   /// of a loop.
1018   struct RegisterUsage {
1019     /// Holds the number of loop invariant values that are used in the loop.
1020     /// The key is ClassID of target-provided register class.
1021     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1022     /// Holds the maximum number of concurrent live intervals in the loop.
1023     /// The key is ClassID of target-provided register class.
1024     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1025   };
1026 
1027   /// \return Returns information about the register usages of the loop for the
1028   /// given vectorization factors.
1029   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1030 
1031   /// Collect values we want to ignore in the cost model.
1032   void collectValuesToIgnore();
1033 
1034   /// \returns The smallest bitwidth each instruction can be represented with.
1035   /// The vector equivalents of these instructions should be truncated to this
1036   /// type.
1037   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1038     return MinBWs;
1039   }
1040 
1041   /// \returns True if it is more profitable to scalarize instruction \p I for
1042   /// vectorization factor \p VF.
1043   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1044     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1045 
1046     // Cost model is not run in the VPlan-native path - return conservative
1047     // result until this changes.
1048     if (EnableVPlanNativePath)
1049       return false;
1050 
1051     auto Scalars = InstsToScalarize.find(VF);
1052     assert(Scalars != InstsToScalarize.end() &&
1053            "VF not yet analyzed for scalarization profitability");
1054     return Scalars->second.find(I) != Scalars->second.end();
1055   }
1056 
1057   /// Returns true if \p I is known to be uniform after vectorization.
1058   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1059     if (VF == 1)
1060       return true;
1061 
1062     // Cost model is not run in the VPlan-native path - return conservative
1063     // result until this changes.
1064     if (EnableVPlanNativePath)
1065       return false;
1066 
1067     auto UniformsPerVF = Uniforms.find(VF);
1068     assert(UniformsPerVF != Uniforms.end() &&
1069            "VF not yet analyzed for uniformity");
1070     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1071   }
1072 
1073   /// Returns true if \p I is known to be scalar after vectorization.
1074   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1075     if (VF == 1)
1076       return true;
1077 
1078     // Cost model is not run in the VPlan-native path - return conservative
1079     // result until this changes.
1080     if (EnableVPlanNativePath)
1081       return false;
1082 
1083     auto ScalarsPerVF = Scalars.find(VF);
1084     assert(ScalarsPerVF != Scalars.end() &&
1085            "Scalar values are not calculated for VF");
1086     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1087   }
1088 
1089   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1090   /// for vectorization factor \p VF.
1091   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1092     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1093            !isProfitableToScalarize(I, VF) &&
1094            !isScalarAfterVectorization(I, VF);
1095   }
1096 
1097   /// Decision that was taken during cost calculation for memory instruction.
1098   enum InstWidening {
1099     CM_Unknown,
1100     CM_Widen,         // For consecutive accesses with stride +1.
1101     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1102     CM_Interleave,
1103     CM_GatherScatter,
1104     CM_Scalarize
1105   };
1106 
1107   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1108   /// instruction \p I and vector width \p VF.
1109   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1110                            unsigned Cost) {
1111     assert(VF >= 2 && "Expected VF >=2");
1112     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1113   }
1114 
1115   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1116   /// interleaving group \p Grp and vector width \p VF.
1117   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1118                            InstWidening W, unsigned Cost) {
1119     assert(VF >= 2 && "Expected VF >=2");
1120     // Broadcast this decision to all instructions inside the group.
1121     // But the cost will be assigned to one instruction only.
1122     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1123       if (auto *I = Grp->getMember(i)) {
1124         if (Grp->getInsertPos() == I)
1125           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1126         else
1127           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1128       }
1129     }
1130   }
1131 
1132   /// Return the cost model decision for the given instruction \p I and vector
1133   /// width \p VF. Return CM_Unknown if this instruction did not pass
1134   /// through the cost modeling.
1135   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1136     assert(VF >= 2 && "Expected VF >=2");
1137 
1138     // Cost model is not run in the VPlan-native path - return conservative
1139     // result until this changes.
1140     if (EnableVPlanNativePath)
1141       return CM_GatherScatter;
1142 
1143     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1144     auto Itr = WideningDecisions.find(InstOnVF);
1145     if (Itr == WideningDecisions.end())
1146       return CM_Unknown;
1147     return Itr->second.first;
1148   }
1149 
1150   /// Return the vectorization cost for the given instruction \p I and vector
1151   /// width \p VF.
1152   unsigned getWideningCost(Instruction *I, unsigned VF) {
1153     assert(VF >= 2 && "Expected VF >=2");
1154     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1155     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1156            "The cost is not calculated");
1157     return WideningDecisions[InstOnVF].second;
1158   }
1159 
1160   /// Return True if instruction \p I is an optimizable truncate whose operand
1161   /// is an induction variable. Such a truncate will be removed by adding a new
1162   /// induction variable with the destination type.
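  /// For illustration, in IR such as
  ///   %iv    = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
  ///   %trunc = trunc i64 %iv to i32
  /// the truncate can be replaced by a new i32 induction variable
  /// (illustrative IR, names made up).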
1163   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1164     // If the instruction is not a truncate, return false.
1165     auto *Trunc = dyn_cast<TruncInst>(I);
1166     if (!Trunc)
1167       return false;
1168 
1169     // Get the source and destination types of the truncate.
1170     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1171     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1172 
1173     // If the truncate is free for the given types, return false. Replacing a
1174     // free truncate with an induction variable would add an induction variable
1175     // update instruction to each iteration of the loop. We exclude from this
1176     // check the primary induction variable since it will need an update
1177     // instruction regardless.
1178     Value *Op = Trunc->getOperand(0);
1179     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1180       return false;
1181 
1182     // If the truncated value is not an induction variable, return false.
1183     return Legal->isInductionPhi(Op);
1184   }
1185 
1186   /// Collects the instructions to scalarize for each predicated instruction in
1187   /// the loop.
1188   void collectInstsToScalarize(unsigned VF);
1189 
1190   /// Collect Uniform and Scalar values for the given \p VF.
1191   /// The sets depend on CM decision for Load/Store instructions
1192   /// that may be vectorized as interleave, gather-scatter or scalarized.
1193   void collectUniformsAndScalars(unsigned VF) {
1194     // Do the analysis once.
1195     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1196       return;
1197     setCostBasedWideningDecision(VF);
1198     collectLoopUniforms(VF);
1199     collectLoopScalars(VF);
1200   }
1201 
1202   /// Returns true if the target machine supports masked store operation
1203   /// for the given \p DataType and kind of access to \p Ptr.
1204   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1205     return Legal->isConsecutivePtr(Ptr) &&
1206            TTI.isLegalMaskedStore(DataType, Alignment);
1207   }
1208 
1209   /// Returns true if the target machine supports masked load operation
1210   /// for the given \p DataType and kind of access to \p Ptr.
1211   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1212     return Legal->isConsecutivePtr(Ptr) &&
1213            TTI.isLegalMaskedLoad(DataType, Alignment);
1214   }
1215 
1216   /// Returns true if the target machine supports masked scatter operation
1217   /// for the given \p DataType.
1218   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1219     return TTI.isLegalMaskedScatter(DataType, Alignment);
1220   }
1221 
1222   /// Returns true if the target machine supports masked gather operation
1223   /// for the given \p DataType.
1224   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1225     return TTI.isLegalMaskedGather(DataType, Alignment);
1226   }
1227 
1228   /// Returns true if the target machine can represent \p V as a masked gather
1229   /// or scatter operation.
1230   bool isLegalGatherOrScatter(Value *V) {
1231     bool LI = isa<LoadInst>(V);
1232     bool SI = isa<StoreInst>(V);
1233     if (!LI && !SI)
1234       return false;
1235     auto *Ty = getMemInstValueType(V);
1236     MaybeAlign Align = getLoadStoreAlignment(V);
1237     return (LI && isLegalMaskedGather(Ty, Align)) ||
1238            (SI && isLegalMaskedScatter(Ty, Align));
1239   }
1240 
1241   /// Returns true if \p I is an instruction that will be scalarized with
1242   /// predication. Such instructions include conditional stores and
1243   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
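  ///
  /// For example (illustrative only), in
  ///   if (c[i] != 0)
  ///     a[i] = b[i] / c[i];
  /// the division may trap for lanes where the predicate is false (c[i] == 0),
  /// so it is scalarized and predicated rather than executed as a wide vector
  /// operation.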
1246   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1247 
1248   // Returns true if \p I is an instruction that will be predicated either
1249   // through scalar predication or masked load/store or masked gather/scatter.
1250   // Superset of instructions that return true for isScalarWithPredication.
1251   bool isPredicatedInst(Instruction *I) {
1252     if (!blockNeedsPredication(I->getParent()))
1253       return false;
1254     // Loads and stores that need some form of masked operation are predicated
1255     // instructions.
1256     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1257       return Legal->isMaskRequired(I);
1258     return isScalarWithPredication(I);
1259   }
1260 
1261   /// Returns true if \p I is a memory instruction with consecutive memory
1262   /// access that can be widened.
1263   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1264 
1265   /// Returns true if \p I is a memory instruction in an interleaved-group
1266   /// of memory accesses that can be vectorized with wide vector loads/stores
1267   /// and shuffles.
1268   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1269 
1270   /// Check if \p Instr belongs to any interleaved access group.
1271   bool isAccessInterleaved(Instruction *Instr) {
1272     return InterleaveInfo.isInterleaved(Instr);
1273   }
1274 
1275   /// Get the interleaved access group that \p Instr belongs to.
1276   const InterleaveGroup<Instruction> *
1277   getInterleavedAccessGroup(Instruction *Instr) {
1278     return InterleaveInfo.getInterleaveGroup(Instr);
1279   }
1280 
1281   /// Returns true if an interleaved group requires a scalar iteration
1282   /// to handle accesses with gaps, and there is nothing preventing us from
1283   /// creating a scalar epilogue.
1284   bool requiresScalarEpilogue() const {
1285     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1286   }
1287 
  /// Returns true if a scalar epilogue is permitted, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1290   bool isScalarEpilogueAllowed() const {
1291     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1292   }
1293 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1295   bool foldTailByMasking() const { return FoldTailByMasking; }
1296 
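  /// Returns true if the instructions in \p BB require predication, either
  /// because the tail is folded by masking or because the original loop
  /// requires the block to be predicated.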
1297   bool blockNeedsPredication(BasicBlock *BB) {
1298     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1299   }
1300 
1301   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1302   /// with factor VF.  Return the cost of the instruction, including
1303   /// scalarization overhead if it's needed.
1304   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1305 
1306   /// Estimate cost of a call instruction CI if it were vectorized with factor
1307   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or it is too
  /// expensive.
1311   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1312 
1313   /// Invalidates decisions already taken by the cost model.
1314   void invalidateCostModelingDecisions() {
1315     WideningDecisions.clear();
1316     Uniforms.clear();
1317     Scalars.clear();
1318   }
1319 
1320 private:
1321   unsigned NumPredStores = 0;
1322 
1323   /// \return An upper bound for the vectorization factor, larger than zero.
1324   /// One is returned if vectorization should best be avoided due to cost.
1325   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1326 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1334   using VectorizationCostTy = std::pair<unsigned, bool>;
1335 
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
1340   VectorizationCostTy expectedCost(unsigned VF);
1341 
1342   /// Returns the execution time cost of an instruction for a given vector
1343   /// width. Vector width of one means scalar.
1344   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1345 
1346   /// The cost-computation logic from getInstructionCost which provides
1347   /// the vector type as an output parameter.
1348   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1349 
1350   /// Calculate vectorization cost of memory instruction \p I.
1351   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1352 
1353   /// The cost computation for scalarized memory instruction.
1354   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1355 
1356   /// The cost computation for interleaving group of memory instructions.
1357   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1358 
1359   /// The cost computation for Gather/Scatter instruction.
1360   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1361 
1362   /// The cost computation for widening instruction \p I with consecutive
1363   /// memory access.
1364   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1365 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
1370   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1371 
1372   /// Estimate the overhead of scalarizing an instruction. This is a
1373   /// convenience wrapper for the type-based getScalarizationOverhead API.
1374   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1375 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1378   bool isConsecutiveLoadOrStore(Instruction *I);
1379 
1380   /// Returns true if an artificially high cost for emulated masked memrefs
1381   /// should be used.
1382   bool useEmulatedMaskMemRefHack(Instruction *I);
1383 
1384   /// Map of scalar integer values to the smallest bitwidth they can be legally
1385   /// represented as. The vector equivalents of these values should be truncated
1386   /// to this type.
1387   MapVector<Instruction *, uint64_t> MinBWs;
1388 
1389   /// A type representing the costs for instructions if they were to be
1390   /// scalarized rather than vectorized. The entries are Instruction-Cost
1391   /// pairs.
1392   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1393 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1396   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1397 
1398   /// Records whether it is allowed to have the original scalar loop execute at
1399   /// least once. This may be needed as a fallback loop in case runtime
1400   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1402   /// or as a peel-loop to handle gaps in interleave-groups.
1403   /// Under optsize and when the trip count is very small we don't allow any
1404   /// iterations to execute in the scalar loop.
1405   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1406 
1407   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1408   bool FoldTailByMasking = false;
1409 
1410   /// A map holding scalar costs for different vectorization factors. The
1411   /// presence of a cost for an instruction in the mapping indicates that the
1412   /// instruction will be scalarized when vectorizing with the associated
1413   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1414   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1415 
1416   /// Holds the instructions known to be uniform after vectorization.
1417   /// The data is collected per VF.
1418   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1419 
1420   /// Holds the instructions known to be scalar after vectorization.
1421   /// The data is collected per VF.
1422   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1423 
1424   /// Holds the instructions (address computations) that are forced to be
1425   /// scalarized.
1426   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1427 
1428   /// Returns the expected difference in cost from scalarizing the expression
1429   /// feeding a predicated instruction \p PredInst. The instructions to
1430   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1431   /// non-negative return value implies the expression will be scalarized.
1432   /// Currently, only single-use chains are considered for scalarization.
1433   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1434                               unsigned VF);
1435 
1436   /// Collect the instructions that are uniform after vectorization. An
1437   /// instruction is uniform if we represent it with a single scalar value in
1438   /// the vectorized loop corresponding to each vector iteration. Examples of
1439   /// uniform instructions include pointer operands of consecutive or
1440   /// interleaved memory accesses. Note that although uniformity implies an
1441   /// instruction will be scalar, the reverse is not true. In general, a
1442   /// scalarized instruction will be represented by VF scalar values in the
1443   /// vectorized loop, each corresponding to an iteration of the original
1444   /// scalar loop.
1445   void collectLoopUniforms(unsigned VF);
1446 
1447   /// Collect the instructions that are scalar after vectorization. An
1448   /// instruction is scalar if it is known to be uniform or will be scalarized
1449   /// during vectorization. Non-uniform scalarized instructions will be
1450   /// represented by VF values in the vectorized loop, each corresponding to an
1451   /// iteration of the original scalar loop.
1452   void collectLoopScalars(unsigned VF);
1453 
1454   /// Keeps cost model vectorization decision and cost for instructions.
1455   /// Right now it is used for memory instructions only.
1456   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1457                                 std::pair<InstWidening, unsigned>>;
1458 
1459   DecisionList WideningDecisions;
1460 
1461   /// Returns true if \p V is expected to be vectorized and it needs to be
1462   /// extracted.
1463   bool needsExtract(Value *V, unsigned VF) const {
1464     Instruction *I = dyn_cast<Instruction>(V);
1465     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1466       return false;
1467 
1468     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1470     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1471     // the scalars are collected. That should be a safe assumption in most
1472     // cases, because we check if the operands have vectorizable types
1473     // beforehand in LoopVectorizationLegality.
1474     return Scalars.find(VF) == Scalars.end() ||
1475            !isScalarAfterVectorization(I, VF);
  }
1477 
1478   /// Returns a range containing only operands needing to be extracted.
1479   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1480                                                    unsigned VF) {
1481     return SmallVector<Value *, 4>(make_filter_range(
1482         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1483   }
1484 
1485 public:
1486   /// The loop that we evaluate.
1487   Loop *TheLoop;
1488 
1489   /// Predicated scalar evolution analysis.
1490   PredicatedScalarEvolution &PSE;
1491 
1492   /// Loop Info analysis.
1493   LoopInfo *LI;
1494 
1495   /// Vectorization legality.
1496   LoopVectorizationLegality *Legal;
1497 
1498   /// Vector target information.
1499   const TargetTransformInfo &TTI;
1500 
1501   /// Target Library Info.
1502   const TargetLibraryInfo *TLI;
1503 
1504   /// Demanded bits analysis.
1505   DemandedBits *DB;
1506 
1507   /// Assumption cache.
1508   AssumptionCache *AC;
1509 
1510   /// Interface to emit optimization remarks.
1511   OptimizationRemarkEmitter *ORE;
1512 
1513   const Function *TheFunction;
1514 
1515   /// Loop Vectorize Hint.
1516   const LoopVectorizeHints *Hints;
1517 
  /// The interleaved access information contains groups of interleaved
  /// accesses with the same stride that are close to each other.
1520   InterleavedAccessInfo &InterleaveInfo;
1521 
1522   /// Values to ignore in the cost model.
1523   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1524 
1525   /// Values to ignore in the cost model when VF > 1.
1526   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1527 };
1528 
1529 } // end namespace llvm
1530 
1531 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1532 // vectorization. The loop needs to be annotated with #pragma omp simd
1533 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1534 // vector length information is not provided, vectorization is not considered
1535 // explicit. Interleave hints are not allowed either. These limitations will be
1536 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1538 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1539 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1540 // provides *explicit vectorization hints* (LV can bypass legal checks and
1541 // assume that vectorization is legal). However, both hints are implemented
1542 // using the same metadata (llvm.loop.vectorize, processed by
1543 // LoopVectorizeHints). This will be fixed in the future when the native IR
1544 // representation for pragma 'omp simd' is introduced.
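// For example (an illustrative sketch), an outer loop annotated as
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)     // outer loop considered here
//     for (int j = 0; j < M; ++j)   // inner loop
//       A[i][j] += B[i][j];
// carries the explicit vector-length information required above.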
1545 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1546                                    OptimizationRemarkEmitter *ORE) {
1547   assert(!OuterLp->empty() && "This is not an outer loop");
1548   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1549 
1550   // Only outer loops with an explicit vectorization hint are supported.
1551   // Unannotated outer loops are ignored.
1552   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1553     return false;
1554 
1555   Function *Fn = OuterLp->getHeader()->getParent();
1556   if (!Hints.allowVectorization(Fn, OuterLp,
1557                                 true /*VectorizeOnlyWhenForced*/)) {
1558     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1559     return false;
1560   }
1561 
1562   if (Hints.getInterleave() > 1) {
1563     // TODO: Interleave support is future work.
1564     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1565                          "outer loops.\n");
1566     Hints.emitRemarkWithHints();
1567     return false;
1568   }
1569 
1570   return true;
1571 }
1572 
1573 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1574                                   OptimizationRemarkEmitter *ORE,
1575                                   SmallVectorImpl<Loop *> &V) {
1576   // Collect inner loops and outer loops without irreducible control flow. For
1577   // now, only collect outer loops that have explicit vectorization hints. If we
1578   // are stress testing the VPlan H-CFG construction, we collect the outermost
1579   // loop of every loop nest.
1580   if (L.empty() || VPlanBuildStressTest ||
1581       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1582     LoopBlocksRPO RPOT(&L);
1583     RPOT.perform(LI);
1584     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1585       V.push_back(&L);
1586       // TODO: Collect inner loops inside marked outer loops in case
1587       // vectorization fails for the outer loop. Do not invoke
1588       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1589       // already known to be reducible. We can use an inherited attribute for
1590       // that.
1591       return;
1592     }
1593   }
1594   for (Loop *InnerL : L)
1595     collectSupportedLoops(*InnerL, LI, ORE, V);
1596 }
1597 
1598 namespace {
1599 
1600 /// The LoopVectorize Pass.
1601 struct LoopVectorize : public FunctionPass {
1602   /// Pass identification, replacement for typeid
1603   static char ID;
1604 
1605   LoopVectorizePass Impl;
1606 
1607   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1608                          bool VectorizeOnlyWhenForced = false)
1609       : FunctionPass(ID),
1610         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1611     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1612   }
1613 
1614   bool runOnFunction(Function &F) override {
1615     if (skipFunction(F))
1616       return false;
1617 
1618     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1619     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1620     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1621     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1622     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1623     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1624     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1625     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1626     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1627     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1628     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1629     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1630     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1631 
1632     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1633         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1634 
1635     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1636                         GetLAA, *ORE, PSI).MadeAnyChange;
1637   }
1638 
1639   void getAnalysisUsage(AnalysisUsage &AU) const override {
1640     AU.addRequired<AssumptionCacheTracker>();
1641     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1642     AU.addRequired<DominatorTreeWrapperPass>();
1643     AU.addRequired<LoopInfoWrapperPass>();
1644     AU.addRequired<ScalarEvolutionWrapperPass>();
1645     AU.addRequired<TargetTransformInfoWrapperPass>();
1646     AU.addRequired<AAResultsWrapperPass>();
1647     AU.addRequired<LoopAccessLegacyAnalysis>();
1648     AU.addRequired<DemandedBitsWrapperPass>();
1649     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1650     AU.addRequired<InjectTLIMappingsLegacy>();
1651 
1652     // We currently do not preserve loopinfo/dominator analyses with outer loop
1653     // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
1655     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1656     if (!EnableVPlanNativePath) {
1657       AU.addPreserved<LoopInfoWrapperPass>();
1658       AU.addPreserved<DominatorTreeWrapperPass>();
1659     }
1660 
1661     AU.addPreserved<BasicAAWrapperPass>();
1662     AU.addPreserved<GlobalsAAWrapperPass>();
1663     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1664   }
1665 };
1666 
1667 } // end anonymous namespace
1668 
1669 //===----------------------------------------------------------------------===//
1670 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1671 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1672 //===----------------------------------------------------------------------===//
1673 
1674 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1678   Instruction *Instr = dyn_cast<Instruction>(V);
1679   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1680                      (!Instr ||
1681                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1682   // Place the code for broadcasting invariant variables in the new preheader.
1683   IRBuilder<>::InsertPointGuard Guard(Builder);
1684   if (SafeToHoist)
1685     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1686 
1687   // Broadcast the scalar into all locations in the vector.
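  // For VF = 4 this typically becomes an insertelement into lane zero followed
  // by a zero-mask shufflevector, e.g. (illustrative IR):
  //   %ins   = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %splat = shufflevector <4 x i32> %ins, <4 x i32> undef,
  //                          <4 x i32> zeroinitializer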
1688   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1689 
1690   return Shuf;
1691 }
1692 
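// Creates a vector induction phi together with its update. For example, with
// VF = 4, UF = 1 and an integer step of 1, the generated code resembles the
// following illustrative IR:
//   vector.body:
//     %vec.ind      = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %ph ],
//                                   [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>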
1693 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1694     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1695   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1696          "Expected either an induction phi-node or a truncate of it!");
1697   Value *Start = II.getStartValue();
1698 
  // Construct the initial value of the vector IV in the vector loop preheader.
1700   auto CurrIP = Builder.saveIP();
1701   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1702   if (isa<TruncInst>(EntryVal)) {
1703     assert(Start->getType()->isIntegerTy() &&
1704            "Truncation requires an integer type");
1705     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1706     Step = Builder.CreateTrunc(Step, TruncType);
1707     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1708   }
1709   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1710   Value *SteppedStart =
1711       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1712 
1713   // We create vector phi nodes for both integer and floating-point induction
1714   // variables. Here, we determine the kind of arithmetic we will perform.
1715   Instruction::BinaryOps AddOp;
1716   Instruction::BinaryOps MulOp;
1717   if (Step->getType()->isIntegerTy()) {
1718     AddOp = Instruction::Add;
1719     MulOp = Instruction::Mul;
1720   } else {
1721     AddOp = II.getInductionOpcode();
1722     MulOp = Instruction::FMul;
1723   }
1724 
1725   // Multiply the vectorization factor by the step using integer or
1726   // floating-point arithmetic as appropriate.
1727   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1728   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1729 
1730   // Create a vector splat to use in the induction update.
1731   //
1732   // FIXME: If the step is non-constant, we create the vector splat with
1733   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1734   //        handle a constant vector splat.
1735   Value *SplatVF =
1736       isa<Constant>(Mul)
1737           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1738           : Builder.CreateVectorSplat(VF, Mul);
1739   Builder.restoreIP(CurrIP);
1740 
1741   // We may need to add the step a number of times, depending on the unroll
1742   // factor. The last of those goes into the PHI.
1743   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1744                                     &*LoopVectorBody->getFirstInsertionPt());
1745   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1746   Instruction *LastInduction = VecInd;
1747   for (unsigned Part = 0; Part < UF; ++Part) {
1748     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1749 
1750     if (isa<TruncInst>(EntryVal))
1751       addMetadata(LastInduction, EntryVal);
1752     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1753 
1754     LastInduction = cast<Instruction>(addFastMathFlag(
1755         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1756     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1757   }
1758 
1759   // Move the last step to the end of the latch block. This ensures consistent
1760   // placement of all induction updates.
1761   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1762   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1763   auto *ICmp = cast<Instruction>(Br->getCondition());
1764   LastInduction->moveBefore(ICmp);
1765   LastInduction->setName("vec.ind.next");
1766 
1767   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1768   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1769 }
1770 
1771 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1772   return Cost->isScalarAfterVectorization(I, VF) ||
1773          Cost->isProfitableToScalarize(I, VF);
1774 }
1775 
1776 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1777   if (shouldScalarizeInstruction(IV))
1778     return true;
1779   auto isScalarInst = [&](User *U) -> bool {
1780     auto *I = cast<Instruction>(U);
1781     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1782   };
1783   return llvm::any_of(IV->users(), isScalarInst);
1784 }
1785 
1786 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1787     const InductionDescriptor &ID, const Instruction *EntryVal,
1788     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1789   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1790          "Expected either an induction phi-node or a truncate of it!");
1791 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1798   if (isa<TruncInst>(EntryVal))
1799     return;
1800 
1801   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1802   if (Casts.empty())
1803     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
1807   Instruction *CastInst = *Casts.begin();
1808   if (Lane < UINT_MAX)
1809     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1810   else
1811     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1812 }
1813 
1814 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1815   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1816          "Primary induction variable must have an integer type");
1817 
1818   auto II = Legal->getInductionVars().find(IV);
1819   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1820 
1821   auto ID = II->second;
1822   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1823 
1824   // The value from the original loop to which we are mapping the new induction
1825   // variable.
1826   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1827 
1828   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1829 
1830   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1832   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1833     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1834            "Induction step should be loop invariant");
1835     if (PSE.getSE()->isSCEVable(IV->getType())) {
1836       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1837       return Exp.expandCodeFor(Step, Step->getType(),
1838                                LoopVectorPreHeader->getTerminator());
1839     }
1840     return cast<SCEVUnknown>(Step)->getValue();
1841   };
1842 
1843   // The scalar value to broadcast. This is derived from the canonical
1844   // induction variable. If a truncation type is given, truncate the canonical
1845   // induction variable and step. Otherwise, derive these values from the
1846   // induction descriptor.
1847   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1848     Value *ScalarIV = Induction;
1849     if (IV != OldInduction) {
1850       ScalarIV = IV->getType()->isIntegerTy()
1851                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1852                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1853                                           IV->getType());
1854       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1855       ScalarIV->setName("offset.idx");
1856     }
1857     if (Trunc) {
1858       auto *TruncType = cast<IntegerType>(Trunc->getType());
1859       assert(Step->getType()->isIntegerTy() &&
1860              "Truncation requires an integer step");
1861       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1862       Step = Builder.CreateTrunc(Step, TruncType);
1863     }
1864     return ScalarIV;
1865   };
1866 
  // Create the vector values from the scalar IV when we are not creating a
  // vector IV.
1869   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1870     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1871     for (unsigned Part = 0; Part < UF; ++Part) {
1872       Value *EntryPart =
1873           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1874       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1875       if (Trunc)
1876         addMetadata(EntryPart, Trunc);
1877       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1878     }
1879   };
1880 
1881   // Now do the actual transformations, and start with creating the step value.
1882   Value *Step = CreateStepValue(ID.getStep());
1883   if (VF <= 1) {
1884     Value *ScalarIV = CreateScalarIV(Step);
1885     CreateSplatIV(ScalarIV, Step);
1886     return;
1887   }
1888 
1889   // Determine if we want a scalar version of the induction variable. This is
1890   // true if the induction variable itself is not widened, or if it has at
1891   // least one user in the loop that is not widened.
1892   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1893   if (!NeedsScalarIV) {
1894     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1895     return;
1896   }
1897 
1898   // Try to create a new independent vector induction variable. If we can't
1899   // create the phi node, we will splat the scalar induction variable in each
1900   // loop iteration.
1901   if (!shouldScalarizeInstruction(EntryVal)) {
1902     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1903     Value *ScalarIV = CreateScalarIV(Step);
1904     // Create scalar steps that can be used by instructions we will later
1905     // scalarize. Note that the addition of the scalar steps will not increase
1906     // the number of instructions in the loop in the common case prior to
1907     // InstCombine. We will be trading one vector extract for each scalar step.
1908     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1909     return;
1910   }
1911 
1912   // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV.
1914   Value *ScalarIV = CreateScalarIV(Step);
1915   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1916 }
1917 
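// Returns a vector whose lane L holds Val[L] + (StartIdx + L) * Step. For
// example (a sketch in pseudo-IR), with VF = 4, StartIdx = 0 and an integer
// step:
//   %step.splat = splat(%step)                       ; <4 x i64>
//   %offsets    = mul <4 x i64> <i64 0, i64 1, i64 2, i64 3>, %step.splat
//   %induction  = add <4 x i64> %val, %offsets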
1918 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1919                                           Instruction::BinaryOps BinOp) {
1920   // Create and check the types.
1921   auto *ValVTy = cast<VectorType>(Val->getType());
1922   int VLen = ValVTy->getNumElements();
1923 
1924   Type *STy = Val->getType()->getScalarType();
1925   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1926          "Induction Step must be an integer or FP");
1927   assert(Step->getType() == STy && "Step has wrong type");
1928 
1929   SmallVector<Constant *, 8> Indices;
1930 
1931   if (STy->isIntegerTy()) {
    // Create a vector of consecutive indices starting at StartIdx.
1933     for (int i = 0; i < VLen; ++i)
1934       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1935 
1936     // Add the consecutive indices to the vector value.
1937     Constant *Cv = ConstantVector::get(Indices);
1938     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1939     Step = Builder.CreateVectorSplat(VLen, Step);
1940     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    //        flags, which can be taken from the original scalar operations.
1943     Step = Builder.CreateMul(Cv, Step);
1944     return Builder.CreateAdd(Val, Step, "induction");
1945   }
1946 
1947   // Floating point induction.
1948   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1949          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive indices starting at StartIdx.
1951   for (int i = 0; i < VLen; ++i)
1952     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1953 
1954   // Add the consecutive indices to the vector value.
1955   Constant *Cv = ConstantVector::get(Indices);
1956 
1957   Step = Builder.CreateVectorSplat(VLen, Step);
1958 
1959   // Floating point operations had to be 'fast' to enable the induction.
1960   FastMathFlags Flags;
1961   Flags.setFast();
1962 
1963   Value *MulOp = Builder.CreateFMul(Cv, Step);
1964   if (isa<Instruction>(MulOp))
    // Have to check because MulOp may have been folded to a constant.
1966     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1967 
1968   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1969   if (isa<Instruction>(BOp))
1970     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1971   return BOp;
1972 }
1973 
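// Produces per-lane scalar steps of the form ScalarIV + (VF * Part + Lane) *
// Step. For example (a sketch), with VF = 4 and UF = 2, a uniform EntryVal
// only needs lane 0 of each part, i.e. {ScalarIV + 0 * Step, ScalarIV + 4 *
// Step}, whereas a non-uniform EntryVal gets all eight lane values.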
1974 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1975                                            Instruction *EntryVal,
1976                                            const InductionDescriptor &ID) {
1977   // We shouldn't have to build scalar steps if we aren't vectorizing.
1978   assert(VF > 1 && "VF should be greater than one");
1979 
  // Get the value type and ensure it and the step have the same type.
1981   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1982   assert(ScalarIVTy == Step->getType() &&
1983          "Val and Step should have the same type");
1984 
1985   // We build scalar steps for both integer and floating-point induction
1986   // variables. Here, we determine the kind of arithmetic we will perform.
1987   Instruction::BinaryOps AddOp;
1988   Instruction::BinaryOps MulOp;
1989   if (ScalarIVTy->isIntegerTy()) {
1990     AddOp = Instruction::Add;
1991     MulOp = Instruction::Mul;
1992   } else {
1993     AddOp = ID.getInductionOpcode();
1994     MulOp = Instruction::FMul;
1995   }
1996 
1997   // Determine the number of scalars we need to generate for each unroll
1998   // iteration. If EntryVal is uniform, we only need to generate the first
1999   // lane. Otherwise, we generate all VF values.
2000   unsigned Lanes =
2001       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
2002                                                                          : VF;
2003   // Compute the scalar steps and save the results in VectorLoopValueMap.
2004   for (unsigned Part = 0; Part < UF; ++Part) {
2005     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2006       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2007       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2008       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2009       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2010       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2011     }
2012   }
2013 }
2014 
2015 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2016   assert(V != Induction && "The new induction variable should not be used.");
2017   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2018   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2019 
2020   // If we have a stride that is replaced by one, do it here. Defer this for
2021   // the VPlan-native path until we start running Legal checks in that path.
2022   if (!EnableVPlanNativePath && Legal->hasStride(V))
2023     V = ConstantInt::get(V->getType(), 1);
2024 
2025   // If we have a vector mapped to this value, return it.
2026   if (VectorLoopValueMap.hasVectorValue(V, Part))
2027     return VectorLoopValueMap.getVectorValue(V, Part);
2028 
2029   // If the value has not been vectorized, check if it has been scalarized
2030   // instead. If it has been scalarized, and we actually need the value in
2031   // vector form, we will construct the vector values on demand.
2032   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2033     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2034 
2035     // If we've scalarized a value, that value should be an instruction.
2036     auto *I = cast<Instruction>(V);
2037 
2038     // If we aren't vectorizing, we can just copy the scalar map values over to
2039     // the vector map.
2040     if (VF == 1) {
2041       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2042       return ScalarValue;
2043     }
2044 
2045     // Get the last scalar instruction we generated for V and Part. If the value
2046     // is known to be uniform after vectorization, this corresponds to lane zero
2047     // of the Part unroll iteration. Otherwise, the last instruction is the one
2048     // we created for the last vector lane of the Part unroll iteration.
2049     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2050     auto *LastInst = cast<Instruction>(
2051         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2052 
2053     // Set the insert point after the last scalarized instruction. This ensures
2054     // the insertelement sequence will directly follow the scalar definitions.
2055     auto OldIP = Builder.saveIP();
2056     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2057     Builder.SetInsertPoint(&*NewIP);
2058 
2059     // However, if we are vectorizing, we need to construct the vector values.
2060     // If the value is known to be uniform after vectorization, we can just
2061     // broadcast the scalar value corresponding to lane zero for each unroll
2062     // iteration. Otherwise, we construct the vector values using insertelement
2063     // instructions. Since the resulting vectors are stored in
2064     // VectorLoopValueMap, we will only generate the insertelements once.
2065     Value *VectorValue = nullptr;
2066     if (Cost->isUniformAfterVectorization(I, VF)) {
2067       VectorValue = getBroadcastInstrs(ScalarValue);
2068       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2069     } else {
2070       // Initialize packing with insertelements to start from undef.
2071       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2072       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2073       for (unsigned Lane = 0; Lane < VF; ++Lane)
2074         packScalarIntoVectorValue(V, {Part, Lane});
2075       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2076     }
2077     Builder.restoreIP(OldIP);
2078     return VectorValue;
2079   }
2080 
2081   // If this scalar is unknown, assume that it is a constant or that it is
2082   // loop invariant. Broadcast V and save the value for future uses.
2083   Value *B = getBroadcastInstrs(V);
2084   VectorLoopValueMap.setVectorValue(V, Part, B);
2085   return B;
2086 }
2087 
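// Returns the scalar value for (Part, Lane). If the value was vectorized, the
// requested lane is extracted on demand, e.g. (illustrative IR):
//   %scalar = extractelement <4 x i32> %vec.part, i32 %lane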
2088 Value *
2089 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2090                                             const VPIteration &Instance) {
2091   // If the value is not an instruction contained in the loop, it should
2092   // already be scalar.
2093   if (OrigLoop->isLoopInvariant(V))
2094     return V;
2095 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2099 
2100   // If the value from the original loop has not been vectorized, it is
2101   // represented by UF x VF scalar values in the new loop. Return the requested
2102   // scalar value.
2103   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2104     return VectorLoopValueMap.getScalarValue(V, Instance);
2105 
2106   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2107   // for the given unroll part. If this entry is not a vector type (i.e., the
2108   // vectorization factor is one), there is no need to generate an
2109   // extractelement instruction.
2110   auto *U = getOrCreateVectorValue(V, Instance.Part);
2111   if (!U->getType()->isVectorTy()) {
2112     assert(VF == 1 && "Value not scalarized has non-vector type");
2113     return U;
2114   }
2115 
2116   // Otherwise, the value from the original loop has been vectorized and is
2117   // represented by UF vector values. Extract and return the requested scalar
2118   // value from the appropriate vector lane.
2119   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2120 }
2121 
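// Inserts the scalar value recorded for (Part, Lane) into the cached vector
// value for Part, e.g. (illustrative IR):
//   %packed = insertelement <4 x i32> %vec.part, i32 %scalar, i32 %lane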
2122 void InnerLoopVectorizer::packScalarIntoVectorValue(
2123     Value *V, const VPIteration &Instance) {
2124   assert(V != Induction && "The new induction variable should not be used.");
2125   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2126   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2127 
2128   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2129   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2130   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2131                                             Builder.getInt32(Instance.Lane));
2132   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2133 }
2134 
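// Reverses the lanes of a vector value with a shufflevector, e.g. for VF = 4
// (illustrative IR):
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>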
2135 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2136   assert(Vec->getType()->isVectorTy() && "Invalid type");
2137   SmallVector<int, 8> ShuffleMask;
2138   for (unsigned i = 0; i < VF; ++i)
2139     ShuffleMask.push_back(VF - i - 1);
2140 
2141   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2142                                      ShuffleMask, "reverse");
2143 }
2144 
2145 // Return whether we allow using masked interleave-groups (for dealing with
2146 // strided loads/stores that reside in predicated blocks, or for dealing
2147 // with gaps).
2148 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2149   // If an override option has been passed in for interleaved accesses, use it.
2150   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2151     return EnableMaskedInterleavedMemAccesses;
2152 
2153   return TTI.enableMaskedInterleavedAccessVectorization();
2154 }
2155 
// Try to vectorize the interleave group that \p Group represents.
2157 //
2158 // E.g. Translate following interleaved load group (factor = 3):
2159 //   for (i = 0; i < N; i+=3) {
2160 //     R = Pic[i];             // Member of index 0
2161 //     G = Pic[i+1];           // Member of index 1
2162 //     B = Pic[i+2];           // Member of index 2
2163 //     ... // do something to R, G, B
2164 //   }
2165 // To:
2166 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2167 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2168 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2169 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2170 //
2171 // Or translate following interleaved store group (factor = 3):
2172 //   for (i = 0; i < N; i+=3) {
2173 //     ... do something to R, G, B
2174 //     Pic[i]   = R;           // Member of index 0
2175 //     Pic[i+1] = G;           // Member of index 1
2176 //     Pic[i+2] = B;           // Member of index 2
2177 //   }
2178 // To:
2179 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2180 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2181 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2182 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2183 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2184 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2185     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2186     VPValue *Addr, VPValue *BlockInMask) {
2187   Instruction *Instr = Group->getInsertPos();
2188   const DataLayout &DL = Instr->getModule()->getDataLayout();
2189 
2190   // Prepare for the vector type of the interleaved load/store.
2191   Type *ScalarTy = getMemInstValueType(Instr);
2192   unsigned InterleaveFactor = Group->getFactor();
2193   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2194 
2195   // Prepare for the new pointers.
2196   SmallVector<Value *, 2> AddrParts;
2197   unsigned Index = Group->getIndex(Instr);
2198 
2199   // TODO: extend the masked interleaved-group support to reversed access.
2200   assert((!BlockInMask || !Group->isReverse()) &&
2201          "Reversed masked interleave-group not supported.");
2202 
2203   // If the group is reverse, adjust the index to refer to the last vector lane
2204   // instead of the first. We adjust the index from the first vector lane,
2205   // rather than directly getting the pointer for lane VF - 1, because the
2206   // pointer operand of the interleaved access is supposed to be uniform. For
2207   // uniform instructions, we're only required to generate a value for the
2208   // first vector lane in each unroll iteration.
2209   if (Group->isReverse())
2210     Index += (VF - 1) * Group->getFactor();
2211 
2212   for (unsigned Part = 0; Part < UF; Part++) {
2213     Value *AddrPart = State.get(Addr, {Part, 0});
2214     setDebugLocFromInst(Builder, AddrPart);
2215 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2227 
2228     bool InBounds = false;
2229     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2230       InBounds = gep->isInBounds();
2231     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2232     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2233 
2234     // Cast to the vector pointer type.
2235     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2236     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2237     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2238   }
2239 
2240   setDebugLocFromInst(Builder, Instr);
2241   Value *UndefVec = UndefValue::get(VecTy);
2242 
2243   Value *MaskForGaps = nullptr;
2244   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2245     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2246     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2247   }
2248 
2249   // Vectorize the interleaved load group.
2250   if (isa<LoadInst>(Instr)) {
2251     // For each unroll part, create a wide load for the group.
2252     SmallVector<Value *, 2> NewLoads;
2253     for (unsigned Part = 0; Part < UF; Part++) {
2254       Instruction *NewLoad;
2255       if (BlockInMask || MaskForGaps) {
2256         assert(useMaskedInterleavedAccesses(*TTI) &&
2257                "masked interleaved groups are not allowed.");
2258         Value *GroupMask = MaskForGaps;
2259         if (BlockInMask) {
2260           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2261           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2262           Value *ShuffledMask = Builder.CreateShuffleVector(
2263               BlockInMaskPart, Undefs,
2264               createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");
2265           GroupMask = MaskForGaps
2266                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2267                                                 MaskForGaps)
2268                           : ShuffledMask;
2269         }
2270         NewLoad =
2271             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2272                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2275         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2276                                             Group->getAlign(), "wide.vec");
2277       Group->addMetadata(NewLoad);
2278       NewLoads.push_back(NewLoad);
2279     }
2280 
2281     // For each member in the group, shuffle out the appropriate data from the
2282     // wide loads.
2283     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2284       Instruction *Member = Group->getMember(I);
2285 
2286       // Skip the gaps in the group.
2287       if (!Member)
2288         continue;
2289 
2290       auto StrideMask = createStrideMask(I, InterleaveFactor, VF);
2291       for (unsigned Part = 0; Part < UF; Part++) {
2292         Value *StridedVec = Builder.CreateShuffleVector(
2293             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2294 
        // If this member has a different type, cast the result type.
2296         if (Member->getType() != ScalarTy) {
2297           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2298           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2299         }
2300 
2301         if (Group->isReverse())
2302           StridedVec = reverseVector(StridedVec);
2303 
2304         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2305       }
2306     }
2307     return;
2308   }
2309 
  // The sub vector type for the current instruction.
2311   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2312 
2313   // Vectorize the interleaved store group.
2314   for (unsigned Part = 0; Part < UF; Part++) {
2315     // Collect the stored vector from each member.
2316     SmallVector<Value *, 4> StoredVecs;
2317     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
      Instruction *Member = Group->getMember(i);
      assert(Member && "Failed to get a member from an interleaved store group");
2321 
2322       Value *StoredVec = getOrCreateVectorValue(
2323           cast<StoreInst>(Member)->getValueOperand(), Part);
2324       if (Group->isReverse())
2325         StoredVec = reverseVector(StoredVec);
2326 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2330         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2331 
2332       StoredVecs.push_back(StoredVec);
2333     }
2334 
2335     // Concatenate all vectors into a wide vector.
2336     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2337 
2338     // Interleave the elements in the wide vector.
2339     Value *IVec = Builder.CreateShuffleVector(
2340         WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor),
2341         "interleaved.vec");
2342 
2343     Instruction *NewStoreInstr;
2344     if (BlockInMask) {
2345       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2346       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2347       Value *ShuffledMask = Builder.CreateShuffleVector(
2348           BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF),
2349           "interleaved.mask");
2350       NewStoreInstr = Builder.CreateMaskedStore(
2351           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2354       NewStoreInstr =
2355           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2356 
2357     Group->addMetadata(NewStoreInstr);
2358   }
2359 }
2360 
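// Widens a single load or store. As a sketch of the pointer arithmetic below:
// for a consecutive access with VF = 4, unroll part 1 starts at Ptr + 4, while
// a reverse-consecutive access for part 1 starts at Ptr - 4 - 3 so that the
// subsequently reversed vector covers the right elements.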
2361 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2362                                                      VPTransformState &State,
2363                                                      VPValue *Addr,
2364                                                      VPValue *StoredValue,
2365                                                      VPValue *BlockInMask) {
2366   // Attempt to issue a wide load.
2367   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2368   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2369 
2370   assert((LI || SI) && "Invalid Load/Store instruction");
2371   assert((!SI || StoredValue) && "No stored value provided for widened store");
2372   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2373 
2374   LoopVectorizationCostModel::InstWidening Decision =
2375       Cost->getWideningDecision(Instr, VF);
2376   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2377           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2378           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2379          "CM decision is not to widen the memory instruction");
2380 
2381   Type *ScalarDataTy = getMemInstValueType(Instr);
2382   Type *DataTy = VectorType::get(ScalarDataTy, VF);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2385   const DataLayout &DL = Instr->getModule()->getDataLayout();
2386   const Align Alignment =
2387       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2388 
2389   // Determine if the pointer operand of the access is either consecutive or
2390   // reverse consecutive.
2391   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2392   bool ConsecutiveStride =
2393       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2394   bool CreateGatherScatter =
2395       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2396 
2397   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2398   // gather/scatter. Otherwise Decision should have been to Scalarize.
2399   assert((ConsecutiveStride || CreateGatherScatter) &&
2400          "The instruction should be scalarized");
2401   (void)ConsecutiveStride;
2402 
2403   VectorParts BlockInMaskParts(UF);
2404   bool isMaskRequired = BlockInMask;
2405   if (isMaskRequired)
2406     for (unsigned Part = 0; Part < UF; ++Part)
2407       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2408 
2409   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2410     // Calculate the pointer for the specific unroll-part.
2411     GetElementPtrInst *PartPtr = nullptr;
2412 
2413     bool InBounds = false;
2414     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2415       InBounds = gep->isInBounds();
2416 
2417     if (Reverse) {
2418       // If the address is consecutive but reversed, then the
2419       // wide store needs to start at the last vector element.
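     // For example (illustrative, with VF = 4 and Part = 1): the two GEPs
     // below apply element offsets of -Part * VF = -4 and 1 - VF = -3, so the
     // wide access covers elements [i - 7, i - 4]; the loaded or stored vector
     // is then reversed to match the scalar (descending) iteration order.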
2420       PartPtr = cast<GetElementPtrInst>(
2421           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2422       PartPtr->setIsInBounds(InBounds);
2423       PartPtr = cast<GetElementPtrInst>(
2424           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2425       PartPtr->setIsInBounds(InBounds);
2426       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2427         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2428     } else {
2429       PartPtr = cast<GetElementPtrInst>(
2430           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2431       PartPtr->setIsInBounds(InBounds);
2432     }
2433 
2434     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2435     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2436   };
2437 
2438   // Handle Stores:
2439   if (SI) {
2440     setDebugLocFromInst(Builder, SI);
2441 
2442     for (unsigned Part = 0; Part < UF; ++Part) {
2443       Instruction *NewSI = nullptr;
2444       Value *StoredVal = State.get(StoredValue, Part);
2445       if (CreateGatherScatter) {
2446         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2447         Value *VectorGep = State.get(Addr, Part);
2448         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2449                                             MaskPart);
2450       } else {
2451         if (Reverse) {
2452           // If we store to reverse consecutive memory locations, then we need
2453           // to reverse the order of elements in the stored value.
2454           StoredVal = reverseVector(StoredVal);
2455           // We don't want to update the value in the map as it might be used in
2456           // another expression. So don't call resetVectorValue(StoredVal).
2457         }
2458         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2459         if (isMaskRequired)
2460           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2461                                             BlockInMaskParts[Part]);
2462         else
2463           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2464       }
2465       addMetadata(NewSI, SI);
2466     }
2467     return;
2468   }
2469 
2470   // Handle loads.
2471   assert(LI && "Must have a load instruction");
2472   setDebugLocFromInst(Builder, LI);
2473   for (unsigned Part = 0; Part < UF; ++Part) {
2474     Value *NewLI;
2475     if (CreateGatherScatter) {
2476       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2477       Value *VectorGep = State.get(Addr, Part);
2478       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2479                                          nullptr, "wide.masked.gather");
2480       addMetadata(NewLI, LI);
2481     } else {
2482       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2483       if (isMaskRequired)
2484         NewLI = Builder.CreateMaskedLoad(
2485             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2486             "wide.masked.load");
2487       else
2488         NewLI =
2489             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2490 
2491       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2492       addMetadata(NewLI, LI);
2493       if (Reverse)
2494         NewLI = reverseVector(NewLI);
2495     }
2496     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2497   }
2498 }
2499 
2500 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2501                                                const VPIteration &Instance,
2502                                                bool IfPredicateInstr) {
2503   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2504 
2505   setDebugLocFromInst(Builder, Instr);
2506 
2507   // Does this instruction return a value?
2508   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2509 
2510   Instruction *Cloned = Instr->clone();
2511   if (!IsVoidRetTy)
2512     Cloned->setName(Instr->getName() + ".cloned");
2513 
2514   // Replace the operands of the cloned instructions with their scalar
2515   // equivalents in the new loop.
2516   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2517     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2518     Cloned->setOperand(op, NewOp);
2519   }
2520   addNewMetadata(Cloned, Instr);
2521 
2522   // Place the cloned scalar in the new loop.
2523   Builder.Insert(Cloned);
2524 
2525   // Add the cloned scalar to the scalar map entry.
2526   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2527 
2528   // If we just cloned a new assumption, add it to the assumption cache.
2529   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2530     if (II->getIntrinsicID() == Intrinsic::assume)
2531       AC->registerAssumption(II);
2532 
2533   // End if-block.
2534   if (IfPredicateInstr)
2535     PredicatedInstructions.push_back(Cloned);
2536 }
2537 
2538 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2539                                                       Value *End, Value *Step,
2540                                                       Instruction *DL) {
2541   BasicBlock *Header = L->getHeader();
2542   BasicBlock *Latch = L->getLoopLatch();
2543   // As we're just creating this loop, it's possible no latch exists
2544   // yet. If so, use the header as this will be a single block loop.
2545   if (!Latch)
2546     Latch = Header;
2547 
2548   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2549   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2550   setDebugLocFromInst(Builder, OldInst);
2551   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2552 
2553   Builder.SetInsertPoint(Latch->getTerminator());
2554   setDebugLocFromInst(Builder, OldInst);
2555 
2556   // Create i+1 and fill the PHINode.
2557   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2558   Induction->addIncoming(Start, L->getLoopPreheader());
2559   Induction->addIncoming(Next, Latch);
2560   // Create the compare.
2561   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2562   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2563 
2564   // Now we have two terminators. Remove the old one from the block.
2565   Latch->getTerminator()->eraseFromParent();
2566 
2567   return Induction;
2568 }
2569 
2570 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2571   if (TripCount)
2572     return TripCount;
2573 
2574   assert(L && "Create Trip Count for null loop.");
2575   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2576   // Find the loop boundaries.
2577   ScalarEvolution *SE = PSE.getSE();
2578   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2579   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2580          "Invalid loop count");
2581 
2582   Type *IdxTy = Legal->getWidestInductionType();
2583   assert(IdxTy && "No type for induction");
2584 
2585   // The exit count might have the type of i64 while the phi is i32. This can
2586   // happen if we have an induction variable that is sign extended before the
2587   // compare. The only way we can get a backedge-taken count here is if the
2588   // induction variable was signed and as such will not overflow. In such a
2589   // case the truncation is legal.
2590   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2591       IdxTy->getPrimitiveSizeInBits())
2592     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2593   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2594 
2595   // Get the total trip count from the count by adding 1.
2596   const SCEV *ExitCount = SE->getAddExpr(
2597       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2598 
2599   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2600 
2601   // Expand the trip count and place the new instructions in the preheader.
2602   // Notice that the pre-header does not change, only the loop body.
2603   SCEVExpander Exp(*SE, DL, "induction");
2604 
2605   // Count holds the overall loop count (N).
2606   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2607                                 L->getLoopPreheader()->getTerminator());
2608 
2609   if (TripCount->getType()->isPointerTy())
2610     TripCount =
2611         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2612                                     L->getLoopPreheader()->getTerminator());
2613 
2614   return TripCount;
2615 }
2616 
2617 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2618   if (VectorTripCount)
2619     return VectorTripCount;
2620 
2621   Value *TC = getOrCreateTripCount(L);
2622   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2623 
2624   Type *Ty = TC->getType();
2625   Constant *Step = ConstantInt::get(Ty, VF * UF);
2626 
2627   // If the tail is to be folded by masking, round the number of iterations N
2628   // up to a multiple of Step instead of rounding down. This is done by first
2629   // adding Step-1 and then rounding down. Note that it's ok if this addition
2630   // overflows: the vector induction variable will eventually wrap to zero given
2631   // that it starts at zero and its Step is a power of two; the loop will then
2632   // exit, with the last early-exit vector comparison also producing all-true.
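     // For example (illustrative): with N = 10 and Step = VF * UF = 4, the
     // rounded-up count is 10 + 3 = 13 and the vector trip count computed
     // below is 13 - (13 % 4) = 12, i.e. three full vector iterations, the
     // last of which has its two excess lanes masked off.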
2633   if (Cost->foldTailByMasking()) {
2634     assert(isPowerOf2_32(VF * UF) &&
2635            "VF*UF must be a power of 2 when folding tail by masking");
2636     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2637   }
2638 
2639   // Now we need to generate the expression for the part of the loop that the
2640   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2641   // iterations are not required for correctness, or N - Step, otherwise. Step
2642   // is equal to the vectorization factor (number of SIMD elements) times the
2643   // unroll factor (number of SIMD instructions).
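     // For example (illustrative): with N = 20 and Step = 8, R = 20 % 8 = 4,
     // so the vector loop covers 16 iterations and 4 scalar iterations remain
     // for the epilogue loop.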
2644   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2645 
2646   // If there is a non-reversed interleaved group that may speculatively access
2647   // memory out-of-bounds, we need to ensure that there will be at least one
2648   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2649   // the trip count, we set the remainder to be equal to the step. If the step
2650   // does not evenly divide the trip count, no adjustment is necessary since
2651   // there will already be scalar iterations. Note that the minimum iterations
2652   // check ensures that N >= Step.
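     // For example (illustrative): with N = 16 and Step = 8 the remainder
     // would be 0, so it is bumped to 8; the vector loop then covers only 8
     // iterations and the scalar epilogue is guaranteed to run.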
2653   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2654     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2655     R = Builder.CreateSelect(IsZero, Step, R);
2656   }
2657 
2658   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2659 
2660   return VectorTripCount;
2661 }
2662 
2663 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2664                                                    const DataLayout &DL) {
2665   // Verify that V is a vector type with same number of elements as DstVTy.
2666   unsigned VF = DstVTy->getNumElements();
2667   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2668   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2669   Type *SrcElemTy = SrcVecTy->getElementType();
2670   Type *DstElemTy = DstVTy->getElementType();
2671   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2672          "Vector elements must have same size");
2673 
2674   // Do a direct cast if element types are castable.
2675   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2676     return Builder.CreateBitOrPointerCast(V, DstVTy);
2677   }
2678   // V cannot be directly cast to the desired vector type.
2679   // May happen when V is a floating point vector but DstVTy is a vector of
2680   // pointers or vice-versa. Handle this using a two-step bitcast using an
2681   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
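     // For example (illustrative, assuming 64-bit pointers): casting
     // <2 x double> to <2 x i8*> is performed as
     // <2 x double> -> <2 x i64> -> <2 x i8*>.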
2682   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2683          "Only one type should be a pointer type");
2684   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2685          "Only one type should be a floating point type");
2686   Type *IntTy =
2687       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2688   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2689   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2690   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2691 }
2692 
2693 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2694                                                          BasicBlock *Bypass) {
2695   Value *Count = getOrCreateTripCount(L);
2696   // Reuse existing vector loop preheader for TC checks.
2697   // Note that new preheader block is generated for vector loop.
2698   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2699   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2700 
2701   // Generate code to check if the loop's trip count is less than VF * UF, or
2702   // equal to it in case a scalar epilogue is required; this implies that the
2703   // vector trip count is zero. This check also covers the case where adding one
2704   // to the backedge-taken count overflowed leading to an incorrect trip count
2705   // of zero. In this case we will also jump to the scalar loop.
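     // For example (illustrative): with VF * UF = 8, trip counts below 8 take
     // the bypass to the scalar loop, and a trip count of exactly 8 also does
     // so when a scalar epilogue is required.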
2706   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2707                                           : ICmpInst::ICMP_ULT;
2708 
2709   // If tail is to be folded, vector loop takes care of all iterations.
2710   Value *CheckMinIters = Builder.getFalse();
2711   if (!Cost->foldTailByMasking())
2712     CheckMinIters = Builder.CreateICmp(
2713         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2714         "min.iters.check");
2715 
2716   // Create new preheader for vector loop.
2717   LoopVectorPreHeader =
2718       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2719                  "vector.ph");
2720 
2721   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2722                                DT->getNode(Bypass)->getIDom()) &&
2723          "TC check is expected to dominate Bypass");
2724 
2725   // Update dominator for Bypass & LoopExit.
2726   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2727   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2728 
2729   ReplaceInstWithInst(
2730       TCCheckBlock->getTerminator(),
2731       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2732   LoopBypassBlocks.push_back(TCCheckBlock);
2733 }
2734 
2735 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2736   // Reuse existing vector loop preheader for SCEV checks.
2737   // Note that new preheader block is generated for vector loop.
2738   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2739 
2740   // Generate the code to check the SCEV assumptions that we made.
2741   // We want the new basic block to start at the first instruction in a
2742   // sequence of instructions that form a check.
2743   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2744                    "scev.check");
2745   Value *SCEVCheck = Exp.expandCodeForPredicate(
2746       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2747 
2748   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2749     if (C->isZero())
2750       return;
2751 
2752   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2753          "Cannot SCEV check stride or overflow when optimizing for size");
2754 
2755   SCEVCheckBlock->setName("vector.scevcheck");
2756   // Create new preheader for vector loop.
2757   LoopVectorPreHeader =
2758       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2759                  nullptr, "vector.ph");
2760 
2761   // Update dominator only if this is first RT check.
2762   if (LoopBypassBlocks.empty()) {
2763     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2764     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2765   }
2766 
2767   ReplaceInstWithInst(
2768       SCEVCheckBlock->getTerminator(),
2769       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2770   LoopBypassBlocks.push_back(SCEVCheckBlock);
2771   AddedSafetyChecks = true;
2772 }
2773 
2774 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2775   // VPlan-native path does not do any analysis for runtime checks currently.
2776   if (EnableVPlanNativePath)
2777     return;
2778 
2779   // Reuse existing vector loop preheader for runtime memory checks.
2780   // Note that new preheader block is generated for vector loop.
2781   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2782 
2783   // Generate the code that checks in runtime if arrays overlap. We put the
2784   // checks into a separate block to make the more common case of few elements
2785   // faster.
2786   Instruction *FirstCheckInst;
2787   Instruction *MemRuntimeCheck;
2788   std::tie(FirstCheckInst, MemRuntimeCheck) =
2789       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2790   if (!MemRuntimeCheck)
2791     return;
2792 
2793   if (MemCheckBlock->getParent()->hasOptSize()) {
2794     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2795            "Cannot emit memory checks when optimizing for size, unless forced "
2796            "to vectorize.");
2797     ORE->emit([&]() {
2798       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2799                                         L->getStartLoc(), L->getHeader())
2800              << "Code-size may be reduced by not forcing "
2801                 "vectorization, or by source-code modifications "
2802                 "eliminating the need for runtime checks "
2803                 "(e.g., adding 'restrict').";
2804     });
2805   }
2806 
2807   MemCheckBlock->setName("vector.memcheck");
2808   // Create new preheader for vector loop.
2809   LoopVectorPreHeader =
2810       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2811                  "vector.ph");
2812 
2813   // Update dominator only if this is first RT check.
2814   if (LoopBypassBlocks.empty()) {
2815     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2816     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2817   }
2818 
2819   ReplaceInstWithInst(
2820       MemCheckBlock->getTerminator(),
2821       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2822   LoopBypassBlocks.push_back(MemCheckBlock);
2823   AddedSafetyChecks = true;
2824 
2825   // We currently don't use LoopVersioning for the actual loop cloning but we
2826   // still use it to add the noalias metadata.
2827   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2828                                           PSE.getSE());
2829   LVer->prepareNoAliasMetadata();
2830 }
2831 
2832 Value *InnerLoopVectorizer::emitTransformedIndex(
2833     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2834     const InductionDescriptor &ID) const {
2835 
2836   SCEVExpander Exp(*SE, DL, "induction");
2837   auto Step = ID.getStep();
2838   auto StartValue = ID.getStartValue();
2839   assert(Index->getType() == Step->getType() &&
2840          "Index type does not match StepValue type");
2841 
2842   // Note: the IR at this point is broken. We cannot use SE to create any new
2843   // SCEV and then expand it, hoping that SCEV's simplification will give us
2844   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2845   // lead to various SCEV crashes. So all we can do is use the builder and rely
2846   // on InstCombine for future simplifications. Here we handle some trivial
2847   // cases only.
2848   auto CreateAdd = [&B](Value *X, Value *Y) {
2849     assert(X->getType() == Y->getType() && "Types don't match!");
2850     if (auto *CX = dyn_cast<ConstantInt>(X))
2851       if (CX->isZero())
2852         return Y;
2853     if (auto *CY = dyn_cast<ConstantInt>(Y))
2854       if (CY->isZero())
2855         return X;
2856     return B.CreateAdd(X, Y);
2857   };
2858 
2859   auto CreateMul = [&B](Value *X, Value *Y) {
2860     assert(X->getType() == Y->getType() && "Types don't match!");
2861     if (auto *CX = dyn_cast<ConstantInt>(X))
2862       if (CX->isOne())
2863         return Y;
2864     if (auto *CY = dyn_cast<ConstantInt>(Y))
2865       if (CY->isOne())
2866         return X;
2867     return B.CreateMul(X, Y);
2868   };
2869 
2870   switch (ID.getKind()) {
2871   case InductionDescriptor::IK_IntInduction: {
2872     assert(Index->getType() == StartValue->getType() &&
2873            "Index type does not match StartValue type");
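     // In general the transformed index is StartValue + Index * Step; e.g.
     // (illustrative) StartValue = 100, Step = 3 and Index = 4 yield 112. The
     // Step == -1 case below is just a shortcut for the same computation.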
2874     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2875       return B.CreateSub(StartValue, Index);
2876     auto *Offset = CreateMul(
2877         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2878     return CreateAdd(StartValue, Offset);
2879   }
2880   case InductionDescriptor::IK_PtrInduction: {
2881     assert(isa<SCEVConstant>(Step) &&
2882            "Expected constant step for pointer induction");
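     // The transformed pointer is StartValue advanced by Index * Step
     // elements of the pointee type, computed with a GEP.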
2883     return B.CreateGEP(
2884         StartValue->getType()->getPointerElementType(), StartValue,
2885         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2886                                            &*B.GetInsertPoint())));
2887   }
2888   case InductionDescriptor::IK_FpInduction: {
2889     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2890     auto InductionBinOp = ID.getInductionBinOp();
2891     assert(InductionBinOp &&
2892            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2893             InductionBinOp->getOpcode() == Instruction::FSub) &&
2894            "Original bin op should be defined for FP induction");
2895 
2896     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2897 
2898     // Floating point operations had to be 'fast' to enable the induction.
2899     FastMathFlags Flags;
2900     Flags.setFast();
2901 
2902     Value *MulExp = B.CreateFMul(StepValue, Index);
2903     if (isa<Instruction>(MulExp))
2904       // We have to check because MulExp may be a constant.
2905       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2906 
2907     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2908                                "induction");
2909     if (isa<Instruction>(BOp))
2910       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2911 
2912     return BOp;
2913   }
2914   case InductionDescriptor::IK_NoInduction:
2915     return nullptr;
2916   }
2917   llvm_unreachable("invalid enum");
2918 }
2919 
2920 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2921   /*
2922    In this function we generate a new loop. The new loop will contain
2923    the vectorized instructions while the old loop will continue to run the
2924    scalar remainder.
2925 
2926        [ ] <-- loop iteration number check.
2927     /   |
2928    /    v
2929   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2930   |  /  |
2931   | /   v
2932   ||   [ ]     <-- vector pre header.
2933   |/    |
2934   |     v
2935   |    [  ] \
2936   |    [  ]_|   <-- vector loop.
2937   |     |
2938   |     v
2939   |   -[ ]   <--- middle-block.
2940   |  /  |
2941   | /   v
2942   -|- >[ ]     <--- new preheader.
2943    |    |
2944    |    v
2945    |   [ ] \
2946    |   [ ]_|   <-- old scalar loop to handle remainder.
2947     \   |
2948      \  v
2949       >[ ]     <-- exit block.
2950    ...
2951    */
2952 
2953   MDNode *OrigLoopID = OrigLoop->getLoopID();
2954 
2955   // Some loops have a single integer induction variable, while other loops
2956   // don't. One example is C++ iterators that often have multiple pointer
2957   // induction variables. In the code below we also support a case where we
2958   // don't have a single induction variable.
2959   //
2960   // We try to obtain an induction variable from the original loop as hard
2961   // as possible. However if we don't find one that:
2962   //   - is an integer
2963   //   - counts from zero, stepping by one
2964   //   - is the size of the widest induction variable type
2965   // then we create a new one.
2966   OldInduction = Legal->getPrimaryInduction();
2967   Type *IdxTy = Legal->getWidestInductionType();
2968 
2969   // Split the single block loop into the two loop structure described above.
2970   LoopScalarBody = OrigLoop->getHeader();
2971   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2972   LoopExitBlock = OrigLoop->getExitBlock();
2973   assert(LoopExitBlock && "Must have an exit block");
2974   assert(LoopVectorPreHeader && "Invalid loop structure");
2975 
2976   LoopMiddleBlock =
2977       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2978                  LI, nullptr, "middle.block");
2979   LoopScalarPreHeader =
2980       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2981                  nullptr, "scalar.ph");
2982   // We intentionally don't let SplitBlock to update LoopInfo since
2983   // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
2984   // LoopVectorBody is explicitly added to the correct place few lines later.
2985   LoopVectorBody =
2986       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2987                  nullptr, nullptr, "vector.body");
2988 
2989   // Update dominator for loop exit.
2990   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2991 
2992   // Create and register the new vector loop.
2993   Loop *Lp = LI->AllocateLoop();
2994   Loop *ParentLoop = OrigLoop->getParentLoop();
2995 
2996   // Insert the new loop into the loop nest and register the new basic blocks
2997   // before calling any utilities such as SCEV that require valid LoopInfo.
2998   if (ParentLoop) {
2999     ParentLoop->addChildLoop(Lp);
3000   } else {
3001     LI->addTopLevelLoop(Lp);
3002   }
3003   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3004 
3005   // Find the loop boundaries.
3006   Value *Count = getOrCreateTripCount(Lp);
3007 
3008   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3009 
3010   // Now, compare the new count to zero. If it is zero skip the vector loop and
3011   // jump to the scalar loop. This check also covers the case where the
3012   // backedge-taken count is uint##_max: adding one to it will overflow leading
3013   // to an incorrect trip count of zero. In this (rare) case we will also jump
3014   // to the scalar loop.
3015   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3016 
3017   // Generate the code to check any assumptions that we've made for SCEV
3018   // expressions.
3019   emitSCEVChecks(Lp, LoopScalarPreHeader);
3020 
3021   // Generate the code that checks in runtime if arrays overlap. We put the
3022   // checks into a separate block to make the more common case of few elements
3023   // faster.
3024   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3025 
3026   // Generate the induction variable.
3027   // The loop step is equal to the vectorization factor (num of SIMD elements)
3028   // times the unroll factor (num of SIMD instructions).
3029   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3030   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3031   Induction =
3032       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3033                               getDebugLocFromInstOrOperands(OldInduction));
3034 
3035   // We are going to resume the execution of the scalar loop.
3036   // Go over all of the induction variables that we found and fix the
3037   // PHIs that are left in the scalar version of the loop.
3038   // The starting values of PHI nodes depend on the counter of the last
3039   // iteration in the vectorized loop.
3040   // If we come from a bypass edge then we need to start from the original
3041   // start value.
3042 
3043   // This variable saves the new starting index for the scalar loop. It is used
3044   // to test if there are any tail iterations left once the vector loop has
3045   // completed.
3046   for (auto &InductionEntry : Legal->getInductionVars()) {
3047     PHINode *OrigPhi = InductionEntry.first;
3048     InductionDescriptor II = InductionEntry.second;
3049 
3050     // Create phi nodes to merge from the backedge-taken check block.
3051     PHINode *BCResumeVal =
3052         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3053                         LoopScalarPreHeader->getTerminator());
3054     // Copy original phi DL over to the new one.
3055     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3056     Value *&EndValue = IVEndValues[OrigPhi];
3057     if (OrigPhi == OldInduction) {
3058       // We know what the end value is.
3059       EndValue = CountRoundDown;
3060     } else {
3061       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3062       Type *StepType = II.getStep()->getType();
3063       Instruction::CastOps CastOp =
3064           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3065       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3066       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3067       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3068       EndValue->setName("ind.end");
3069     }
3070 
3071     // The new PHI merges the original incoming value, in case of a bypass,
3072     // or the value at the end of the vectorized loop.
3073     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3074 
3075     // Fix the scalar body counter (PHI node).
3076     // The old induction's phi node in the scalar body needs the truncated
3077     // value.
3078     for (BasicBlock *BB : LoopBypassBlocks)
3079       BCResumeVal->addIncoming(II.getStartValue(), BB);
3080     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3081   }
3082 
3083   // We need the OrigLoop (scalar loop part) latch terminator to help
3084   // produce correct debug info for the middle block BB instructions.
3085   // The legality check stage guarantees that the loop will have a single
3086   // latch.
3087   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3088          "Scalar loop latch terminator isn't a branch");
3089   BranchInst *ScalarLatchBr =
3090       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3091 
3092   // Add a check in the middle block to see if we have completed
3093   // all of the iterations in the first vector loop.
3094   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3095   // If tail is to be folded, we know we don't need to run the remainder.
3096   Value *CmpN = Builder.getTrue();
3097   if (!Cost->foldTailByMasking()) {
3098     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3099                            CountRoundDown, "cmp.n",
3100                            LoopMiddleBlock->getTerminator());
3101 
3102     // Here we use the same DebugLoc as the scalar loop latch branch instead
3103     // of the corresponding compare because they may have ended up with
3104     // different line numbers and we want to avoid awkward line stepping while
3105     // debugging, e.g. if the compare got a line number inside the loop.
3106     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3107   }
3108 
3109   BranchInst *BrInst =
3110       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3111   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3112   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3113 
3114   // Get ready to start creating new instructions into the vectorized body.
3115   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3116          "Inconsistent vector loop preheader");
3117   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3118 
3119   Optional<MDNode *> VectorizedLoopID =
3120       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3121                                       LLVMLoopVectorizeFollowupVectorized});
3122   if (VectorizedLoopID.hasValue()) {
3123     Lp->setLoopID(VectorizedLoopID.getValue());
3124 
3125     // Do not setAlreadyVectorized if loop attributes have been defined
3126     // explicitly.
3127     return LoopVectorPreHeader;
3128   }
3129 
3130   // Keep all loop hints from the original loop on the vector loop (we'll
3131   // replace the vectorizer-specific hints below).
3132   if (MDNode *LID = OrigLoop->getLoopID())
3133     Lp->setLoopID(LID);
3134 
3135   LoopVectorizeHints Hints(Lp, true, *ORE);
3136   Hints.setAlreadyVectorized();
3137 
3138 #ifdef EXPENSIVE_CHECKS
3139   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3140   LI->verify(*DT);
3141 #endif
3142 
3143   return LoopVectorPreHeader;
3144 }
3145 
3146 // Fix up external users of the induction variable. At this point, we are
3147 // in LCSSA form, with all external PHIs that use the IV having one input value,
3148 // coming from the remainder loop. We need those PHIs to also have a correct
3149 // value for the IV when arriving directly from the middle block.
3150 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3151                                        const InductionDescriptor &II,
3152                                        Value *CountRoundDown, Value *EndValue,
3153                                        BasicBlock *MiddleBlock) {
3154   // There are two kinds of external IV usages - those that use the value
3155   // computed in the last iteration (the PHI) and those that use the penultimate
3156   // value (the value that feeds into the phi from the loop latch).
3157   // We allow both, but they, obviously, have different values.
3158 
3159   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3160 
3161   DenseMap<Value *, Value *> MissingVals;
3162 
3163   // An external user of the last iteration's value should see the value that
3164   // the remainder loop uses to initialize its own IV.
3165   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3166   for (User *U : PostInc->users()) {
3167     Instruction *UI = cast<Instruction>(U);
3168     if (!OrigLoop->contains(UI)) {
3169       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3170       MissingVals[UI] = EndValue;
3171     }
3172   }
3173 
3174   // An external user of the penultimate value needs to see EndValue - Step.
3175   // The simplest way to get this is to recompute it from the constituent SCEVs,
3176   // that is Start + (Step * (CRD - 1)).
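     // For example (illustrative): with Start = 0, Step = 2 and a vector trip
     // count CRD = 8, EndValue is 0 + 2 * 8 = 16, while a user of the phi
     // itself must see the penultimate value 0 + 2 * (8 - 1) = 14.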
3177   for (User *U : OrigPhi->users()) {
3178     auto *UI = cast<Instruction>(U);
3179     if (!OrigLoop->contains(UI)) {
3180       const DataLayout &DL =
3181           OrigLoop->getHeader()->getModule()->getDataLayout();
3182       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3183 
3184       IRBuilder<> B(MiddleBlock->getTerminator());
3185       Value *CountMinusOne = B.CreateSub(
3186           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3187       Value *CMO =
3188           !II.getStep()->getType()->isIntegerTy()
3189               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3190                              II.getStep()->getType())
3191               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3192       CMO->setName("cast.cmo");
3193       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3194       Escape->setName("ind.escape");
3195       MissingVals[UI] = Escape;
3196     }
3197   }
3198 
3199   for (auto &I : MissingVals) {
3200     PHINode *PHI = cast<PHINode>(I.first);
3201     // One corner case we have to handle is two IVs "chasing" each other,
3202     // that is %IV2 = phi [...], [ %IV1, %latch ]
3203     // In this case, if IV1 has an external use, we need to avoid adding both
3204     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3205     // don't already have an incoming value for the middle block.
3206     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3207       PHI->addIncoming(I.second, MiddleBlock);
3208   }
3209 }
3210 
3211 namespace {
3212 
3213 struct CSEDenseMapInfo {
3214   static bool canHandle(const Instruction *I) {
3215     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3216            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3217   }
3218 
3219   static inline Instruction *getEmptyKey() {
3220     return DenseMapInfo<Instruction *>::getEmptyKey();
3221   }
3222 
3223   static inline Instruction *getTombstoneKey() {
3224     return DenseMapInfo<Instruction *>::getTombstoneKey();
3225   }
3226 
3227   static unsigned getHashValue(const Instruction *I) {
3228     assert(canHandle(I) && "Unknown instruction!");
3229     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3230                                                            I->value_op_end()));
3231   }
3232 
3233   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3234     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3235         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3236       return LHS == RHS;
3237     return LHS->isIdenticalTo(RHS);
3238   }
3239 };
3240 
3241 } // end anonymous namespace
3242 
3243 /// Perform CSE of induction variable instructions.
3244 static void cse(BasicBlock *BB) {
3245   // Perform simple CSE.
3246   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3247   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3248     Instruction *In = &*I++;
3249 
3250     if (!CSEDenseMapInfo::canHandle(In))
3251       continue;
3252 
3253     // Check if we can replace this instruction with any of the
3254     // visited instructions.
3255     if (Instruction *V = CSEMap.lookup(In)) {
3256       In->replaceAllUsesWith(V);
3257       In->eraseFromParent();
3258       continue;
3259     }
3260 
3261     CSEMap[In] = In;
3262   }
3263 }
3264 
3265 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3266                                                        unsigned VF,
3267                                                        bool &NeedToScalarize) {
3268   Function *F = CI->getCalledFunction();
3269   Type *ScalarRetTy = CI->getType();
3270   SmallVector<Type *, 4> Tys, ScalarTys;
3271   for (auto &ArgOp : CI->arg_operands())
3272     ScalarTys.push_back(ArgOp->getType());
3273 
3274   // Estimate cost of scalarized vector call. The source operands are assumed
3275   // to be vectors, so we need to extract individual elements from there,
3276   // execute VF scalar calls, and then gather the result into the vector return
3277   // value.
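     // For example (illustrative): with VF = 4, a scalar call cost of 10 and a
     // scalarization overhead of 6, the scalarized cost computed below is
     // 4 * 10 + 6 = 46; it is then compared against the cost of calling a
     // vector variant of the function, if one is available.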
3278   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3279                                                  TTI::TCK_RecipThroughput);
3280   if (VF == 1)
3281     return ScalarCallCost;
3282 
3283   // Compute corresponding vector type for return value and arguments.
3284   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3285   for (Type *ScalarTy : ScalarTys)
3286     Tys.push_back(ToVectorTy(ScalarTy, VF));
3287 
3288   // Compute costs of unpacking argument values for the scalar calls and
3289   // packing the return values to a vector.
3290   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3291 
3292   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3293 
3294   // If we can't emit a vector call for this function, then the currently found
3295   // cost is the cost we need to return.
3296   NeedToScalarize = true;
3297   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3298   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3299 
3300   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3301     return Cost;
3302 
3303   // If the corresponding vector cost is cheaper, return its cost.
3304   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3305                                                  TTI::TCK_RecipThroughput);
3306   if (VectorCallCost < Cost) {
3307     NeedToScalarize = false;
3308     return VectorCallCost;
3309   }
3310   return Cost;
3311 }
3312 
3313 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3314                                                             unsigned VF) {
3315   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3316   assert(ID && "Expected intrinsic call!");
3317 
3318   FastMathFlags FMF;
3319   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3320     FMF = FPMO->getFastMathFlags();
3321 
3322   SmallVector<Value *, 4> Operands(CI->arg_operands());
3323   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF,
3324                                    TargetTransformInfo::TCK_RecipThroughput,
3325                                    CI);
3326 }
3327 
3328 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3329   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3330   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3331   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3332 }
3333 
3334 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3335   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3336   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3337   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3338 }
3339 
3340 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3341   // For every instruction `I` in MinBWs, truncate the operands, create a
3342   // truncated version of `I` and reextend its result. InstCombine runs
3343   // later and will remove any ext/trunc pairs.
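     // For example (illustrative): a <4 x i32> add whose result is known to
     // need only 8 bits has its operands truncated to <4 x i8>, the add is
     // re-created in <4 x i8>, and the result is zero-extended back to
     // <4 x i32>; InstCombine is expected to clean up the redundant casts.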
3344   SmallPtrSet<Value *, 4> Erased;
3345   for (const auto &KV : Cost->getMinimalBitwidths()) {
3346     // If the value wasn't vectorized, we must maintain the original scalar
3347     // type. The absence of the value from VectorLoopValueMap indicates that it
3348     // wasn't vectorized.
3349     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3350       continue;
3351     for (unsigned Part = 0; Part < UF; ++Part) {
3352       Value *I = getOrCreateVectorValue(KV.first, Part);
3353       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3354           !isa<Instruction>(I))
3355         continue;
3356       Type *OriginalTy = I->getType();
3357       Type *ScalarTruncatedTy =
3358           IntegerType::get(OriginalTy->getContext(), KV.second);
3359       Type *TruncatedTy = VectorType::get(
3360           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
3361       if (TruncatedTy == OriginalTy)
3362         continue;
3363 
3364       IRBuilder<> B(cast<Instruction>(I));
3365       auto ShrinkOperand = [&](Value *V) -> Value * {
3366         if (auto *ZI = dyn_cast<ZExtInst>(V))
3367           if (ZI->getSrcTy() == TruncatedTy)
3368             return ZI->getOperand(0);
3369         return B.CreateZExtOrTrunc(V, TruncatedTy);
3370       };
3371 
3372       // The actual instruction modification depends on the instruction type,
3373       // unfortunately.
3374       Value *NewI = nullptr;
3375       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3376         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3377                              ShrinkOperand(BO->getOperand(1)));
3378 
3379         // Any wrapping introduced by shrinking this operation shouldn't be
3380         // considered undefined behavior. So, we can't unconditionally copy
3381         // arithmetic wrapping flags to NewI.
3382         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3383       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3384         NewI =
3385             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3386                          ShrinkOperand(CI->getOperand(1)));
3387       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3388         NewI = B.CreateSelect(SI->getCondition(),
3389                               ShrinkOperand(SI->getTrueValue()),
3390                               ShrinkOperand(SI->getFalseValue()));
3391       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3392         switch (CI->getOpcode()) {
3393         default:
3394           llvm_unreachable("Unhandled cast!");
3395         case Instruction::Trunc:
3396           NewI = ShrinkOperand(CI->getOperand(0));
3397           break;
3398         case Instruction::SExt:
3399           NewI = B.CreateSExtOrTrunc(
3400               CI->getOperand(0),
3401               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3402           break;
3403         case Instruction::ZExt:
3404           NewI = B.CreateZExtOrTrunc(
3405               CI->getOperand(0),
3406               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3407           break;
3408         }
3409       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3410         auto Elements0 =
3411             cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
3412         auto *O0 = B.CreateZExtOrTrunc(
3413             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3414         auto Elements1 =
3415             cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
3416         auto *O1 = B.CreateZExtOrTrunc(
3417             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3418 
3419         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3420       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3421         // Don't do anything with the operands, just extend the result.
3422         continue;
3423       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3424         auto Elements =
3425             cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
3426         auto *O0 = B.CreateZExtOrTrunc(
3427             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3428         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3429         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3430       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3431         auto Elements =
3432             cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
3433         auto *O0 = B.CreateZExtOrTrunc(
3434             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3435         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3436       } else {
3437         // If we don't know what to do, be conservative and don't do anything.
3438         continue;
3439       }
3440 
3441       // Lastly, extend the result.
3442       NewI->takeName(cast<Instruction>(I));
3443       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3444       I->replaceAllUsesWith(Res);
3445       cast<Instruction>(I)->eraseFromParent();
3446       Erased.insert(I);
3447       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3448     }
3449   }
3450 
3451   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3452   for (const auto &KV : Cost->getMinimalBitwidths()) {
3453     // If the value wasn't vectorized, we must maintain the original scalar
3454     // type. The absence of the value from VectorLoopValueMap indicates that it
3455     // wasn't vectorized.
3456     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3457       continue;
3458     for (unsigned Part = 0; Part < UF; ++Part) {
3459       Value *I = getOrCreateVectorValue(KV.first, Part);
3460       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3461       if (Inst && Inst->use_empty()) {
3462         Value *NewI = Inst->getOperand(0);
3463         Inst->eraseFromParent();
3464         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3465       }
3466     }
3467   }
3468 }
3469 
3470 void InnerLoopVectorizer::fixVectorizedLoop() {
3471   // Insert truncates and extends for any truncated instructions as hints to
3472   // InstCombine.
3473   if (VF > 1)
3474     truncateToMinimalBitwidths();
3475 
3476   // Fix widened non-induction PHIs by setting up the PHI operands.
3477   if (OrigPHIsToFix.size()) {
3478     assert(EnableVPlanNativePath &&
3479            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3480     fixNonInductionPHIs();
3481   }
3482 
3483   // At this point every instruction in the original loop is widened to a
3484   // vector form. Now we need to fix the recurrences in the loop. These PHI
3485   // nodes are currently empty because we did not want to introduce cycles.
3486   // This is the second stage of vectorizing recurrences.
3487   fixCrossIterationPHIs();
3488 
3489   // Forget the original basic block.
3490   PSE.getSE()->forgetLoop(OrigLoop);
3491 
3492   // Fix-up external users of the induction variables.
3493   for (auto &Entry : Legal->getInductionVars())
3494     fixupIVUsers(Entry.first, Entry.second,
3495                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3496                  IVEndValues[Entry.first], LoopMiddleBlock);
3497 
3498   fixLCSSAPHIs();
3499   for (Instruction *PI : PredicatedInstructions)
3500     sinkScalarOperands(&*PI);
3501 
3502   // Remove redundant induction instructions.
3503   cse(LoopVectorBody);
3504 
3505   // Set/update profile weights for the vector and remainder loops as original
3506   // loop iterations are now distributed among them. Note that original loop
3507   // represented by LoopScalarBody becomes remainder loop after vectorization.
3508   //
3509   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3510   // end up with a slightly less precise result, but that should be OK since
3511   // the profile is not inherently precise anyway. Note also that a possible
3512   // bypass of the vector code caused by legality checks is ignored,
3513   // optimistically assigning all the weight to the vector loop.
3514   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3515                                LI->getLoopFor(LoopVectorBody),
3516                                LI->getLoopFor(LoopScalarBody), VF * UF);
3517 }
3518 
3519 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3520   // In order to support recurrences we need to be able to vectorize Phi nodes.
3521   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3522   // stage #2: We now need to fix the recurrences by adding incoming edges to
3523   // the currently empty PHI nodes. At this point every instruction in the
3524   // original loop is widened to a vector form so we can use them to construct
3525   // the incoming edges.
3526   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3527     // Handle first-order recurrences and reductions that need to be fixed.
3528     if (Legal->isFirstOrderRecurrence(&Phi))
3529       fixFirstOrderRecurrence(&Phi);
3530     else if (Legal->isReductionVariable(&Phi))
3531       fixReduction(&Phi);
3532   }
3533 }
3534 
3535 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3536   // This is the second phase of vectorizing first-order recurrences. An
3537   // overview of the transformation is described below. Suppose we have the
3538   // following loop.
3539   //
3540   //   for (int i = 0; i < n; ++i)
3541   //     b[i] = a[i] - a[i - 1];
3542   //
3543   // There is a first-order recurrence on "a". For this loop, the shorthand
3544   // scalar IR looks like:
3545   //
3546   //   scalar.ph:
3547   //     s_init = a[-1]
3548   //     br scalar.body
3549   //
3550   //   scalar.body:
3551   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3552   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3553   //     s2 = a[i]
3554   //     b[i] = s2 - s1
3555   //     br cond, scalar.body, ...
3556   //
3557   // In this example, s1 is a recurrence because its value depends on the
3558   // previous iteration. In the first phase of vectorization, we created a
3559   // temporary value for s1. We now complete the vectorization and produce the
3560   // shorthand vector IR shown below (for VF = 4, UF = 1).
3561   //
3562   //   vector.ph:
3563   //     v_init = vector(..., ..., ..., a[-1])
3564   //     br vector.body
3565   //
3566   //   vector.body
3567   //     i = phi [0, vector.ph], [i+4, vector.body]
3568   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3569   //     v2 = a[i, i+1, i+2, i+3];
3570   //     v3 = vector(v1(3), v2(0, 1, 2))
3571   //     b[i, i+1, i+2, i+3] = v2 - v3
3572   //     br cond, vector.body, middle.block
3573   //
3574   //   middle.block:
3575   //     x = v2(3)
3576   //     br scalar.ph
3577   //
3578   //   scalar.ph:
3579   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3580   //     br scalar.body
3581   //
3582   // After the vector loop finishes executing, we extract the next value of
3583   // the recurrence (x) to use as the initial value in the scalar loop.
3584 
3585   // Get the original loop preheader and single loop latch.
3586   auto *Preheader = OrigLoop->getLoopPreheader();
3587   auto *Latch = OrigLoop->getLoopLatch();
3588 
3589   // Get the initial and previous values of the scalar recurrence.
3590   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3591   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3592 
3593   // Create a vector from the initial value.
3594   auto *VectorInit = ScalarInit;
3595   if (VF > 1) {
3596     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3597     VectorInit = Builder.CreateInsertElement(
3598         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3599         Builder.getInt32(VF - 1), "vector.recur.init");
3600   }
3601 
3602   // We constructed a temporary phi node in the first phase of vectorization.
3603   // This phi node will eventually be deleted.
3604   Builder.SetInsertPoint(
3605       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3606 
3607   // Create a phi node for the new recurrence. The current value will either be
3608   // the initial value inserted into a vector or loop-varying vector value.
3609   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3610   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3611 
3612   // Get the vectorized previous value of the last part UF - 1. It appears last
3613   // among all unrolled iterations, due to the order of their construction.
3614   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3615 
3616   // Find and set the insertion point after the previous value if it is an
3617   // instruction.
3618   BasicBlock::iterator InsertPt;
3619   // Note that the previous value may have been constant-folded so it is not
3620   // guaranteed to be an instruction in the vector loop.
3621   // FIXME: Loop invariant values do not form recurrences. We should deal with
3622   //        them earlier.
3623   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3624     InsertPt = LoopVectorBody->getFirstInsertionPt();
3625   else {
3626     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3627     if (isa<PHINode>(PreviousLastPart))
3628       // If the previous value is a phi node, we should insert after all the phi
3629       // nodes in the block containing the PHI to avoid breaking basic block
3630       // verification. Note that the basic block may be different to
3631       // LoopVectorBody, in case we predicate the loop.
3632       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3633     else
3634       InsertPt = ++PreviousInst->getIterator();
3635   }
3636   Builder.SetInsertPoint(&*InsertPt);
3637 
3638   // We will construct a vector for the recurrence by combining the values for
3639   // the current and previous iterations. This is the required shuffle mask.
3640   SmallVector<int, 8> ShuffleMask(VF);
3641   ShuffleMask[0] = VF - 1;
3642   for (unsigned I = 1; I < VF; ++I)
3643     ShuffleMask[I] = I + VF - 1;
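  // For example, with VF = 4 the mask is <3, 4, 5, 6>: lane 3 of the first
  // shuffle operand (the value carried over from the previous iteration)
  // followed by lanes 0-2 of the second operand (the current iteration's
  // value), realizing the v3 = vector(v1(3), v2(0, 1, 2)) step in the sketch
  // above.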
3644 
3645   // The vector from which to take the initial value for the current iteration
3646   // (actual or unrolled). Initially, this is the vector phi node.
3647   Value *Incoming = VecPhi;
3648 
3649   // Shuffle the current and previous vector and update the vector parts.
3650   for (unsigned Part = 0; Part < UF; ++Part) {
3651     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3652     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3653     auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3654                                                          ShuffleMask)
3655                            : Incoming;
3656     PhiPart->replaceAllUsesWith(Shuffle);
3657     cast<Instruction>(PhiPart)->eraseFromParent();
3658     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3659     Incoming = PreviousPart;
3660   }
3661 
3662   // Fix the latch value of the new recurrence in the vector loop.
3663   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3664 
3665   // Extract the last vector element in the middle block. This will be the
3666   // initial value for the recurrence when jumping to the scalar loop.
3667   auto *ExtractForScalar = Incoming;
3668   if (VF > 1) {
3669     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3670     ExtractForScalar = Builder.CreateExtractElement(
3671         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3672   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the value of the phi
  // itself and not its update from the current iteration. This will be the
  // value used when jumping from LoopMiddleBlock to the exit block, i.e. when
  // the scalar loop is not run at all.
3678   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3679   if (VF > 1)
3680     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3681         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second-to-last element when VF > 1.
3686   else if (UF > 1)
3687     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
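  // For example, with VF = 4 the scalar resume value is lane 3 of the final
  // vector value (a[i+3] in the sketch above), whereas a use of the phi
  // outside the loop receives lane 2 (a[i+2]), i.e. the phi's own value in
  // the last vector iteration rather than its update.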
3688 
3689   // Fix the initial value of the original recurrence in the scalar loop.
3690   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3691   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3692   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3693     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3694     Start->addIncoming(Incoming, BB);
3695   }
3696 
3697   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3698   Phi->setName("scalar.recur");
3699 
3700   // Finally, fix users of the recurrence outside the loop. The users will need
3701   // either the last value of the scalar recurrence or the last value of the
3702   // vector recurrence we extracted in the middle block. Since the loop is in
3703   // LCSSA form, we just need to find all the phi nodes for the original scalar
3704   // recurrence in the exit block, and then add an edge for the middle block.
3705   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3706     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3707       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3708     }
3709   }
3710 }
3711 
3712 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3713   Constant *Zero = Builder.getInt32(0);
3714 
  // Get its reduction variable descriptor.
3716   assert(Legal->isReductionVariable(Phi) &&
3717          "Unable to find the reduction variable");
3718   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3719 
3720   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3721   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3722   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3723   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3724     RdxDesc.getMinMaxRecurrenceKind();
3725   setDebugLocFromInst(Builder, ReductionStartValue);
3726 
3727   // We need to generate a reduction vector from the incoming scalar.
3728   // To do so, we need to generate the 'identity' vector and override
3729   // one of the elements with the incoming scalar reduction. We need
3730   // to do it in the vector-loop preheader.
3731   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3732 
3733   // This is the vector-clone of the value that leaves the loop.
3734   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3735 
  // Find the reduction identity value: zero for addition, or and xor;
  // one for multiplication; -1 for and.
3738   Value *Identity;
3739   Value *VectorStart;
3740   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3741       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3743     if (VF == 1) {
3744       VectorStart = Identity = ReductionStartValue;
3745     } else {
3746       VectorStart = Identity =
3747         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3748     }
3749   } else {
3750     // Handle other reduction kinds:
3751     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3752         RK, VecTy->getScalarType());
3753     if (VF == 1) {
3754       Identity = Iden;
      // With VF == 1 there is no vector to build; the start value is simply
      // the incoming scalar reduction.
3757       VectorStart = ReductionStartValue;
3758     } else {
3759       Identity = ConstantVector::getSplat({VF, false}, Iden);
3760 
3761       // This vector is the Identity vector where the first element is the
3762       // incoming scalar reduction.
3763       VectorStart =
3764         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3765     }
3766   }
3767 
3768   // Wrap flags are in general invalid after vectorization, clear them.
3769   clearReductionWrapFlags(RdxDesc);
3770 
3771   // Fix the vector-loop phi.
3772 
3773   // Reductions do not have to start at zero. They can start with
3774   // any loop invariant values.
3775   BasicBlock *Latch = OrigLoop->getLoopLatch();
3776   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3777 
3778   for (unsigned Part = 0; Part < UF; ++Part) {
3779     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3780     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3781     // Make sure to add the reduction start value only to the
3782     // first unroll part.
3783     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3784     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3785     cast<PHINode>(VecRdxPhi)
3786       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3787   }
3788 
3789   // Before each round, move the insertion point right between
3790   // the PHIs and the values we are going to write.
3791   // This allows us to write both PHINodes and the extractelement
3792   // instructions.
3793   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3794 
3795   setDebugLocFromInst(Builder, LoopExitInst);
3796 
3797   // If tail is folded by masking, the vector value to leave the loop should be
3798   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3799   // instead of the former.
3800   if (Cost->foldTailByMasking()) {
3801     for (unsigned Part = 0; Part < UF; ++Part) {
3802       Value *VecLoopExitInst =
3803           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3804       Value *Sel = nullptr;
3805       for (User *U : VecLoopExitInst->users()) {
3806         if (isa<SelectInst>(U)) {
3807           assert(!Sel && "Reduction exit feeding two selects");
3808           Sel = U;
3809         } else
3810           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3811       }
3812       assert(Sel && "Reduction exit feeds no select");
3813       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3814     }
3815   }
3816 
3817   // If the vector reduction can be performed in a smaller type, we truncate
3818   // then extend the loop exit value to enable InstCombine to evaluate the
3819   // entire expression in the smaller type.
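  // For example, an i32 reduction phi whose recurrence type is i8 has its loop
  // exit value truncated to <VF x i8> and immediately sign- or zero-extended
  // back to <VF x i32>; the final narrowing before the horizontal reduction is
  // done in the middle block below.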
3820   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3821     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3822     Builder.SetInsertPoint(
3823         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3824     VectorParts RdxParts(UF);
3825     for (unsigned Part = 0; Part < UF; ++Part) {
3826       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3827       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3828       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3829                                         : Builder.CreateZExt(Trunc, VecTy);
3830       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3831            UI != RdxParts[Part]->user_end();)
3832         if (*UI != Trunc) {
3833           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3834           RdxParts[Part] = Extnd;
3835         } else {
3836           ++UI;
3837         }
3838     }
3839     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3840     for (unsigned Part = 0; Part < UF; ++Part) {
3841       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3842       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3843     }
3844   }
3845 
3846   // Reduce all of the unrolled parts into a single vector.
3847   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3848   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3849 
3850   // The middle block terminator has already been assigned a DebugLoc here (the
3851   // OrigLoop's single latch terminator). We want the whole middle block to
3852   // appear to execute on this line because: (a) it is all compiler generated,
3853   // (b) these instructions are always executed after evaluating the latch
3854   // conditional branch, and (c) other passes may add new predecessors which
3855   // terminate on this line. This is the easiest way to ensure we don't
3856   // accidentally cause an extra step back into the loop while debugging.
3857   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3858   for (unsigned Part = 1; Part < UF; ++Part) {
3859     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3860     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3861       // Floating point operations had to be 'fast' to enable the reduction.
3862       ReducedPartRdx = addFastMathFlag(
3863           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3864                               ReducedPartRdx, "bin.rdx"),
3865           RdxDesc.getFastMathFlags());
3866     else
3867       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3868                                       RdxPart);
3869   }
3870 
3871   if (VF > 1) {
3872     bool NoNaN = Legal->hasFunNoNaNAttr();
3873     ReducedPartRdx =
3874         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3875     // If the reduction can be performed in a smaller type, we need to extend
3876     // the reduction to the wider type before we branch to the original loop.
3877     if (Phi->getType() != RdxDesc.getRecurrenceType())
3878       ReducedPartRdx =
3879         RdxDesc.isSigned()
3880         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3881         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3882   }
3883 
3884   // Create a phi node that merges control-flow from the backedge-taken check
3885   // block and the middle block.
3886   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3887                                         LoopScalarPreHeader->getTerminator());
3888   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3889     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3890   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3891 
3892   // Now, we need to fix the users of the reduction variable
3893   // inside and outside of the scalar remainder loop.
3894   // We know that the loop is in LCSSA form. We need to update the
3895   // PHI nodes in the exit blocks.
3896   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3897     // All PHINodes need to have a single entry edge, or two if
3898     // we already fixed them.
3899     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3900 
3901     // We found a reduction value exit-PHI. Update it with the
3902     // incoming bypass edge.
3903     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3904       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3905   } // end of the LCSSA phi scan.
3906 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3909   int IncomingEdgeBlockIdx =
3910     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3911   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3912   // Pick the other block.
3913   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3914   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3915   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3916 }
3917 
3918 void InnerLoopVectorizer::clearReductionWrapFlags(
3919     RecurrenceDescriptor &RdxDesc) {
3920   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3921   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3922       RK != RecurrenceDescriptor::RK_IntegerMult)
3923     return;
3924 
3925   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3926   assert(LoopExitInstr && "null loop exit instruction");
3927   SmallVector<Instruction *, 8> Worklist;
3928   SmallPtrSet<Instruction *, 8> Visited;
3929   Worklist.push_back(LoopExitInstr);
3930   Visited.insert(LoopExitInstr);
3931 
3932   while (!Worklist.empty()) {
3933     Instruction *Cur = Worklist.pop_back_val();
3934     if (isa<OverflowingBinaryOperator>(Cur))
3935       for (unsigned Part = 0; Part < UF; ++Part) {
3936         Value *V = getOrCreateVectorValue(Cur, Part);
3937         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3938       }
3939 
3940     for (User *U : Cur->users()) {
3941       Instruction *UI = cast<Instruction>(U);
3942       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3943           Visited.insert(UI).second)
3944         Worklist.push_back(UI);
3945     }
3946   }
3947 }
3948 
3949 void InnerLoopVectorizer::fixLCSSAPHIs() {
3950   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3951     if (LCSSAPhi.getNumIncomingValues() == 1) {
3952       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values have only a single scalar copy.
3954       unsigned LastLane = 0;
3955       if (isa<Instruction>(IncomingValue))
3956           LastLane = Cost->isUniformAfterVectorization(
3957                          cast<Instruction>(IncomingValue), VF)
3958                          ? 0
3959                          : VF - 1;
3960       // Can be a loop invariant incoming value or the last scalar value to be
3961       // extracted from the vectorized loop.
3962       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3963       Value *lastIncomingValue =
3964           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3965       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3966     }
3967   }
3968 }
3969 
3970 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3971   // The basic block and loop containing the predicated instruction.
3972   auto *PredBB = PredInst->getParent();
3973   auto *VectorLoop = LI->getLoopFor(PredBB);
3974 
3975   // Initialize a worklist with the operands of the predicated instruction.
3976   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3977 
3978   // Holds instructions that we need to analyze again. An instruction may be
3979   // reanalyzed if we don't yet know if we can sink it or not.
3980   SmallVector<Instruction *, 8> InstsToReanalyze;
3981 
3982   // Returns true if a given use occurs in the predicated block. Phi nodes use
3983   // their operands in their corresponding predecessor blocks.
3984   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3985     auto *I = cast<Instruction>(U.getUser());
3986     BasicBlock *BB = I->getParent();
3987     if (auto *Phi = dyn_cast<PHINode>(I))
3988       BB = Phi->getIncomingBlock(
3989           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3990     return BB == PredBB;
3991   };
3992 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a pass
  // over the worklist fails to sink a single instruction.
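  // For example, a getelementptr that feeds only a predicated store is moved
  // into the store's predicated block, after which the GEP's own operands
  // become candidates for sinking as well.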
3997   bool Changed;
3998   do {
3999     // Add the instructions that need to be reanalyzed to the worklist, and
4000     // reset the changed indicator.
4001     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4002     InstsToReanalyze.clear();
4003     Changed = false;
4004 
4005     while (!Worklist.empty()) {
4006       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4007 
4008       // We can't sink an instruction if it is a phi node, is already in the
4009       // predicated block, is not in the loop, or may have side effects.
4010       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4011           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4012         continue;
4013 
4014       // It's legal to sink the instruction if all its uses occur in the
4015       // predicated block. Otherwise, there's nothing to do yet, and we may
4016       // need to reanalyze the instruction.
4017       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4018         InstsToReanalyze.push_back(I);
4019         continue;
4020       }
4021 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4024       I->moveBefore(&*PredBB->getFirstInsertionPt());
4025       Worklist.insert(I->op_begin(), I->op_end());
4026 
4027       // The sinking may have enabled other instructions to be sunk, so we will
4028       // need to iterate.
4029       Changed = true;
4030     }
4031   } while (Changed);
4032 }
4033 
4034 void InnerLoopVectorizer::fixNonInductionPHIs() {
4035   for (PHINode *OrigPhi : OrigPHIsToFix) {
4036     PHINode *NewPhi =
4037         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4038     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4039 
4040     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4041         predecessors(OrigPhi->getParent()));
4042     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4043         predecessors(NewPhi->getParent()));
4044     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4045            "Scalar and Vector BB should have the same number of predecessors");
4046 
4047     // The insertion point in Builder may be invalidated by the time we get
4048     // here. Force the Builder insertion point to something valid so that we do
4049     // not run into issues during insertion point restore in
4050     // getOrCreateVectorValue calls below.
4051     Builder.SetInsertPoint(NewPhi);
4052 
4053     // The predecessor order is preserved and we can rely on mapping between
4054     // scalar and vector block predecessors.
4055     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4056       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4057 
4058       // When looking up the new scalar/vector values to fix up, use incoming
4059       // values from original phi.
4060       Value *ScIncV =
4061           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4062 
      // Scalar incoming value may need a broadcast.
4064       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4065       NewPhi->addIncoming(NewIncV, NewPredBB);
4066     }
4067   }
4068 }
4069 
4070 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4071                                    unsigned VF, bool IsPtrLoopInvariant,
4072                                    SmallBitVector &IsIndexLoopInvariant) {
4073   // Construct a vector GEP by widening the operands of the scalar GEP as
4074   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4075   // results in a vector of pointers when at least one operand of the GEP
4076   // is vector-typed. Thus, to keep the representation compact, we only use
4077   // vector-typed operands for loop-varying values.
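  // For illustration, a scalar GEP such as
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %i
  // (placeholder names) with a loop-varying index %i is widened, for VF = 4,
  // into a GEP taking a <4 x i64> index and producing a <4 x i32*> vector of
  // pointers, while the loop-invariant %base stays scalar.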
4078 
4079   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4080     // If we are vectorizing, but the GEP has only loop-invariant operands,
4081     // the GEP we build (by only using vector-typed operands for
4082     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4083     // produce a vector of pointers, we need to either arbitrarily pick an
4084     // operand to broadcast, or broadcast a clone of the original GEP.
4085     // Here, we broadcast a clone of the original.
4086     //
4087     // TODO: If at some point we decide to scalarize instructions having
4088     //       loop-invariant operands, this special case will no longer be
4089     //       required. We would add the scalarization decision to
4090     //       collectLoopScalars() and teach getVectorValue() to broadcast
4091     //       the lane-zero scalar value.
4092     auto *Clone = Builder.Insert(GEP->clone());
4093     for (unsigned Part = 0; Part < UF; ++Part) {
4094       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4095       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4096       addMetadata(EntryPart, GEP);
4097     }
4098   } else {
4099     // If the GEP has at least one loop-varying operand, we are sure to
4100     // produce a vector of pointers. But if we are only unrolling, we want
4101     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4102     // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector value map, as we do for other instructions.
4106     for (unsigned Part = 0; Part < UF; ++Part) {
4107       // The pointer operand of the new GEP. If it's loop-invariant, we
4108       // won't broadcast it.
4109       auto *Ptr = IsPtrLoopInvariant
4110                       ? GEP->getPointerOperand()
4111                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4112 
4113       // Collect all the indices for the new GEP. If any index is
4114       // loop-invariant, we won't broadcast it.
4115       SmallVector<Value *, 4> Indices;
4116       for (auto Index : enumerate(GEP->indices())) {
4117         Value *User = Index.value().get();
4118         if (IsIndexLoopInvariant[Index.index()])
4119           Indices.push_back(User);
4120         else
4121           Indices.push_back(getOrCreateVectorValue(User, Part));
4122       }
4123 
4124       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4125       // but it should be a vector, otherwise.
4126       auto *NewGEP =
4127           GEP->isInBounds()
4128               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4129                                           Indices)
4130               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4131       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4132              "NewGEP is not a pointer vector");
4133       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4134       addMetadata(NewGEP, GEP);
4135     }
4136   }
4137 }
4138 
4139 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4140                                               unsigned VF) {
4141   PHINode *P = cast<PHINode>(PN);
4142   if (EnableVPlanNativePath) {
4143     // Currently we enter here in the VPlan-native path for non-induction
4144     // PHIs where all control flow is uniform. We simply widen these PHIs.
4145     // Create a vector phi with no operands - the vector phi operands will be
4146     // set at the end of vector code generation.
4147     Type *VecTy =
4148         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4149     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4150     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4151     OrigPHIsToFix.push_back(P);
4152 
4153     return;
4154   }
4155 
4156   assert(PN->getParent() == OrigLoop->getHeader() &&
4157          "Non-header phis should have been handled elsewhere");
4158 
4159   // In order to support recurrences we need to be able to vectorize Phi nodes.
4160   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4161   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4162   // this value when we vectorize all of the instructions that use the PHI.
4163   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4164     for (unsigned Part = 0; Part < UF; ++Part) {
4165       // This is phase one of vectorizing PHIs.
4166       Type *VecTy =
4167           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4168       Value *EntryPart = PHINode::Create(
4169           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4170       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4171     }
4172     return;
4173   }
4174 
4175   setDebugLocFromInst(Builder, P);
4176 
4177   // This PHINode must be an induction variable.
4178   // Make sure that we know about it.
4179   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4180 
4181   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4182   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4183 
4184   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4185   // which can be found from the original scalar operations.
4186   switch (II.getKind()) {
4187   case InductionDescriptor::IK_NoInduction:
4188     llvm_unreachable("Unknown induction");
4189   case InductionDescriptor::IK_IntInduction:
4190   case InductionDescriptor::IK_FpInduction:
4191     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4192   case InductionDescriptor::IK_PtrInduction: {
4193     // Handle the pointer induction variable case.
4194     assert(P->getType()->isPointerTy() && "Unexpected type.");
4195     // This is the normalized GEP that starts counting at zero.
4196     Value *PtrInd = Induction;
4197     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4198     // Determine the number of scalars we need to generate for each unroll
4199     // iteration. If the instruction is uniform, we only need to generate the
4200     // first lane. Otherwise, we generate all VF values.
4201     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4202     // These are the scalar results. Notice that we don't generate vector GEPs
4203     // because scalar GEPs result in better code.
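    // For example, with VF = 4 and UF = 2, the scalar GEP for part P and
    // lane L is computed at offset P * 4 + L from the pointer induction,
    // giving eight "next.gep" values per vector iteration (or one per part
    // if the instruction is uniform after vectorization).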
4204     for (unsigned Part = 0; Part < UF; ++Part) {
4205       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4206         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4207         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4208         Value *SclrGep =
4209             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4210         SclrGep->setName("next.gep");
4211         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4212       }
4213     }
4214     return;
4215   }
4216   }
4217 }
4218 
4219 /// A helper function for checking whether an integer division-related
4220 /// instruction may divide by zero (in which case it must be predicated if
4221 /// executed conditionally in the scalar code).
4222 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
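/// For example, 'udiv i32 %x, 7' has a known non-zero divisor and needs no
/// predication, whereas 'udiv i32 %x, %d' with a non-constant divisor must be
/// treated as potentially dividing by zero.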
4226 static bool mayDivideByZero(Instruction &I) {
4227   assert((I.getOpcode() == Instruction::UDiv ||
4228           I.getOpcode() == Instruction::SDiv ||
4229           I.getOpcode() == Instruction::URem ||
4230           I.getOpcode() == Instruction::SRem) &&
4231          "Unexpected instruction");
4232   Value *Divisor = I.getOperand(1);
4233   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4234   return !CInt || CInt->isZero();
4235 }
4236 
4237 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4238                                            VPTransformState &State) {
4239   switch (I.getOpcode()) {
4240   case Instruction::Call:
4241   case Instruction::Br:
4242   case Instruction::PHI:
4243   case Instruction::GetElementPtr:
4244   case Instruction::Select:
4245     llvm_unreachable("This instruction is handled by a different recipe.");
4246   case Instruction::UDiv:
4247   case Instruction::SDiv:
4248   case Instruction::SRem:
4249   case Instruction::URem:
4250   case Instruction::Add:
4251   case Instruction::FAdd:
4252   case Instruction::Sub:
4253   case Instruction::FSub:
4254   case Instruction::FNeg:
4255   case Instruction::Mul:
4256   case Instruction::FMul:
4257   case Instruction::FDiv:
4258   case Instruction::FRem:
4259   case Instruction::Shl:
4260   case Instruction::LShr:
4261   case Instruction::AShr:
4262   case Instruction::And:
4263   case Instruction::Or:
4264   case Instruction::Xor: {
4265     // Just widen unops and binops.
4266     setDebugLocFromInst(Builder, &I);
4267 
4268     for (unsigned Part = 0; Part < UF; ++Part) {
4269       SmallVector<Value *, 2> Ops;
4270       for (VPValue *VPOp : User.operands())
4271         Ops.push_back(State.get(VPOp, Part));
4272 
4273       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4274 
4275       if (auto *VecOp = dyn_cast<Instruction>(V))
4276         VecOp->copyIRFlags(&I);
4277 
4278       // Use this vector value for all users of the original instruction.
4279       VectorLoopValueMap.setVectorValue(&I, Part, V);
4280       addMetadata(V, &I);
4281     }
4282 
4283     break;
4284   }
4285   case Instruction::ICmp:
4286   case Instruction::FCmp: {
4287     // Widen compares. Generate vector compares.
4288     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4289     auto *Cmp = cast<CmpInst>(&I);
4290     setDebugLocFromInst(Builder, Cmp);
4291     for (unsigned Part = 0; Part < UF; ++Part) {
4292       Value *A = State.get(User.getOperand(0), Part);
4293       Value *B = State.get(User.getOperand(1), Part);
4294       Value *C = nullptr;
4295       if (FCmp) {
4296         // Propagate fast math flags.
4297         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4298         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4299         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4300       } else {
4301         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4302       }
4303       VectorLoopValueMap.setVectorValue(&I, Part, C);
4304       addMetadata(C, &I);
4305     }
4306 
4307     break;
4308   }
4309 
4310   case Instruction::ZExt:
4311   case Instruction::SExt:
4312   case Instruction::FPToUI:
4313   case Instruction::FPToSI:
4314   case Instruction::FPExt:
4315   case Instruction::PtrToInt:
4316   case Instruction::IntToPtr:
4317   case Instruction::SIToFP:
4318   case Instruction::UIToFP:
4319   case Instruction::Trunc:
4320   case Instruction::FPTrunc:
4321   case Instruction::BitCast: {
4322     auto *CI = cast<CastInst>(&I);
4323     setDebugLocFromInst(Builder, CI);
4324 
    // Vectorize casts.
4326     Type *DestTy =
4327         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4328 
4329     for (unsigned Part = 0; Part < UF; ++Part) {
4330       Value *A = State.get(User.getOperand(0), Part);
4331       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4332       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4333       addMetadata(Cast, &I);
4334     }
4335     break;
4336   }
4337   default:
4338     // This instruction is not vectorized by simple widening.
4339     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4340     llvm_unreachable("Unhandled instruction!");
4341   } // end of switch.
4342 }
4343 
4344 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4345                                                VPTransformState &State) {
4346   assert(!isa<DbgInfoIntrinsic>(I) &&
4347          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4348   setDebugLocFromInst(Builder, &I);
4349 
4350   Module *M = I.getParent()->getParent()->getParent();
4351   auto *CI = cast<CallInst>(&I);
4352 
4353   SmallVector<Type *, 4> Tys;
4354   for (Value *ArgOperand : CI->arg_operands())
4355     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4356 
4357   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4358 
  // The following flag shows whether we use an intrinsic or a regular call for
  // the vectorized version of the instruction, i.e., whether it is beneficial
  // to perform the intrinsic call rather than the library call.
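  // For example, a call that maps to llvm.sqrt.f32 may be widened to
  // llvm.sqrt.v4f32 when the intrinsic cost is no higher than the call cost,
  // and otherwise a vectorized library routine found in the VFDatabase is
  // used.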
4362   bool NeedToScalarize = false;
4363   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4364   bool UseVectorIntrinsic =
4365       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4366   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4367          "Instruction should be scalarized elsewhere.");
4368 
4369   for (unsigned Part = 0; Part < UF; ++Part) {
4370     SmallVector<Value *, 4> Args;
4371     for (auto &I : enumerate(ArgOperands.operands())) {
4372       // Some intrinsics have a scalar argument - don't replace it with a
4373       // vector.
4374       Value *Arg;
4375       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4376         Arg = State.get(I.value(), Part);
4377       else
4378         Arg = State.get(I.value(), {0, 0});
4379       Args.push_back(Arg);
4380     }
4381 
4382     Function *VectorF;
4383     if (UseVectorIntrinsic) {
4384       // Use vector version of the intrinsic.
4385       Type *TysForDecl[] = {CI->getType()};
4386       if (VF > 1)
4387         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4388       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4389       assert(VectorF && "Can't retrieve vector intrinsic.");
4390     } else {
4391       // Use vector version of the function call.
4392       const VFShape Shape =
4393           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4394 #ifndef NDEBUG
4395       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4396              "Can't create vector function.");
4397 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
4409   }
4410 }
4411 
4412 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4413                                                  bool InvariantCond) {
4414   setDebugLocFromInst(Builder, &I);
4415 
  // The condition can be loop invariant but still defined inside the
4417   // loop. This means that we can't just use the original 'cond' value.
4418   // We have to take the 'vectorized' value and pick the first lane.
4419   // Instcombine will make this a no-op.
4420 
4421   auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4422 
4423   for (unsigned Part = 0; Part < UF; ++Part) {
4424     Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4425     Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4426     Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4427     Value *Sel =
4428         Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4429     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4430     addMetadata(Sel, &I);
4431   }
4432 }
4433 
4434 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4435   // We should not collect Scalars more than once per VF. Right now, this
4436   // function is called from collectUniformsAndScalars(), which already does
4437   // this check. Collecting Scalars for VF=1 does not make any sense.
4438   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4439          "This function should not be visited twice for the same VF");
4440 
4441   SmallSetVector<Instruction *, 8> Worklist;
4442 
4443   // These sets are used to seed the analysis with pointers used by memory
4444   // accesses that will remain scalar.
4445   SmallSetVector<Instruction *, 8> ScalarPtrs;
4446   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4447 
4448   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4449   // The pointer operands of loads and stores will be scalar as long as the
4450   // memory access is not a gather or scatter operation. The value operand of a
4451   // store will remain scalar if the store is scalarized.
4452   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4453     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4454     assert(WideningDecision != CM_Unknown &&
4455            "Widening decision should be ready at this moment");
4456     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4457       if (Ptr == Store->getValueOperand())
4458         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4461     return WideningDecision != CM_GatherScatter;
4462   };
4463 
4464   // A helper that returns true if the given value is a bitcast or
4465   // getelementptr instruction contained in the loop.
4466   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4467     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4468             isa<GetElementPtrInst>(V)) &&
4469            !TheLoop->isLoopInvariant(V);
4470   };
4471 
4472   // A helper that evaluates a memory access's use of a pointer. If the use
4473   // will be a scalar use, and the pointer is only used by memory accesses, we
4474   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4475   // PossibleNonScalarPtrs.
4476   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4477     // We only care about bitcast and getelementptr instructions contained in
4478     // the loop.
4479     if (!isLoopVaryingBitCastOrGEP(Ptr))
4480       return;
4481 
4482     // If the pointer has already been identified as scalar (e.g., if it was
4483     // also identified as uniform), there's nothing to do.
4484     auto *I = cast<Instruction>(Ptr);
4485     if (Worklist.count(I))
4486       return;
4487 
4488     // If the use of the pointer will be a scalar use, and all users of the
4489     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4490     // place the pointer in PossibleNonScalarPtrs.
4491     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4492           return isa<LoadInst>(U) || isa<StoreInst>(U);
4493         }))
4494       ScalarPtrs.insert(I);
4495     else
4496       PossibleNonScalarPtrs.insert(I);
4497   };
4498 
4499   // We seed the scalars analysis with three classes of instructions: (1)
4500   // instructions marked uniform-after-vectorization, (2) bitcast and
4501   // getelementptr instructions used by memory accesses requiring a scalar use,
4502   // and (3) pointer induction variables and their update instructions (we
4503   // currently only scalarize these).
4504   //
4505   // (1) Add to the worklist all instructions that have been identified as
4506   // uniform-after-vectorization.
4507   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4508 
4509   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4510   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4512   // scatter operation. The value operand of a store will remain scalar if the
4513   // store is scalarized.
4514   for (auto *BB : TheLoop->blocks())
4515     for (auto &I : *BB) {
4516       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4517         evaluatePtrUse(Load, Load->getPointerOperand());
4518       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4519         evaluatePtrUse(Store, Store->getPointerOperand());
4520         evaluatePtrUse(Store, Store->getValueOperand());
4521       }
4522     }
4523   for (auto *I : ScalarPtrs)
4524     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4525       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4526       Worklist.insert(I);
4527     }
4528 
4529   // (3) Add to the worklist all pointer induction variables and their update
4530   // instructions.
4531   //
4532   // TODO: Once we are able to vectorize pointer induction variables we should
4533   //       no longer insert them into the worklist here.
4534   auto *Latch = TheLoop->getLoopLatch();
4535   for (auto &Induction : Legal->getInductionVars()) {
4536     auto *Ind = Induction.first;
4537     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4538     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4539       continue;
4540     Worklist.insert(Ind);
4541     Worklist.insert(IndUpdate);
4542     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4543     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4544                       << "\n");
4545   }
4546 
4547   // Insert the forced scalars.
4548   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4549   // induction variable when the PHI user is scalarized.
4550   auto ForcedScalar = ForcedScalars.find(VF);
4551   if (ForcedScalar != ForcedScalars.end())
4552     for (auto *I : ForcedScalar->second)
4553       Worklist.insert(I);
4554 
4555   // Expand the worklist by looking through any bitcasts and getelementptr
4556   // instructions we've already identified as scalar. This is similar to the
4557   // expansion step in collectLoopUniforms(); however, here we're only
4558   // expanding to include additional bitcasts and getelementptr instructions.
4559   unsigned Idx = 0;
4560   while (Idx != Worklist.size()) {
4561     Instruction *Dst = Worklist[Idx++];
4562     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4563       continue;
4564     auto *Src = cast<Instruction>(Dst->getOperand(0));
4565     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4566           auto *J = cast<Instruction>(U);
4567           return !TheLoop->contains(J) || Worklist.count(J) ||
4568                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4569                   isScalarUse(J, Src));
4570         })) {
4571       Worklist.insert(Src);
4572       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4573     }
4574   }
4575 
4576   // An induction variable will remain scalar if all users of the induction
4577   // variable and induction variable update remain scalar.
4578   for (auto &Induction : Legal->getInductionVars()) {
4579     auto *Ind = Induction.first;
4580     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4581 
4582     // We already considered pointer induction variables, so there's no reason
4583     // to look at their users again.
4584     //
4585     // TODO: Once we are able to vectorize pointer induction variables we
4586     //       should no longer skip over them here.
4587     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4588       continue;
4589 
4590     // If tail-folding is applied, the primary induction variable will be used
4591     // to feed a vector compare.
4592     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4593       continue;
4594 
4595     // Determine if all users of the induction variable are scalar after
4596     // vectorization.
4597     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4598       auto *I = cast<Instruction>(U);
4599       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4600     });
4601     if (!ScalarInd)
4602       continue;
4603 
4604     // Determine if all users of the induction variable update instruction are
4605     // scalar after vectorization.
4606     auto ScalarIndUpdate =
4607         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4608           auto *I = cast<Instruction>(U);
4609           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4610         });
4611     if (!ScalarIndUpdate)
4612       continue;
4613 
4614     // The induction variable and its update instruction will remain scalar.
4615     Worklist.insert(Ind);
4616     Worklist.insert(IndUpdate);
4617     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4618     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4619                       << "\n");
4620   }
4621 
4622   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4623 }
4624 
4625 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4626   if (!blockNeedsPredication(I->getParent()))
4627     return false;
4628   switch(I->getOpcode()) {
4629   default:
4630     break;
4631   case Instruction::Load:
4632   case Instruction::Store: {
4633     if (!Legal->isMaskRequired(I))
4634       return false;
4635     auto *Ptr = getLoadStorePointerOperand(I);
4636     auto *Ty = getMemInstValueType(I);
4637     // We have already decided how to vectorize this instruction, get that
4638     // result.
4639     if (VF > 1) {
4640       InstWidening WideningDecision = getWideningDecision(I, VF);
4641       assert(WideningDecision != CM_Unknown &&
4642              "Widening decision should be ready at this moment");
4643       return WideningDecision == CM_Scalarize;
4644     }
4645     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4646     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4647                                 isLegalMaskedGather(Ty, Alignment))
4648                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4649                                 isLegalMaskedScatter(Ty, Alignment));
4650   }
4651   case Instruction::UDiv:
4652   case Instruction::SDiv:
4653   case Instruction::SRem:
4654   case Instruction::URem:
4655     return mayDivideByZero(*I);
4656   }
4657   return false;
4658 }
4659 
4660 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4661                                                                unsigned VF) {
4662   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4663   assert(getWideningDecision(I, VF) == CM_Unknown &&
4664          "Decision should not be set yet.");
4665   auto *Group = getInterleavedAccessGroup(I);
4666   assert(Group && "Must have a group.");
4667 
  // If the instruction's allocated size doesn't equal its type size, it
4669   // requires padding and will be scalarized.
4670   auto &DL = I->getModule()->getDataLayout();
4671   auto *ScalarTy = getMemInstValueType(I);
4672   if (hasIrregularType(ScalarTy, DL, VF))
4673     return false;
4674 
4675   // Check if masking is required.
4676   // A Group may need masking for one of two reasons: it resides in a block that
4677   // needs predication, or it was decided to use masking to deal with gaps.
4678   bool PredicatedAccessRequiresMasking =
4679       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4680   bool AccessWithGapsRequiresMasking =
4681       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4682   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4683     return true;
4684 
4685   // If masked interleaving is required, we expect that the user/target had
4686   // enabled it, because otherwise it either wouldn't have been created or
4687   // it should have been invalidated by the CostModel.
4688   assert(useMaskedInterleavedAccesses(TTI) &&
4689          "Masked interleave-groups for predicated accesses are not enabled.");
4690 
4691   auto *Ty = getMemInstValueType(I);
4692   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4693   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4694                           : TTI.isLegalMaskedStore(Ty, Alignment);
4695 }
4696 
4697 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4698                                                                unsigned VF) {
4699   // Get and ensure we have a valid memory instruction.
4700   LoadInst *LI = dyn_cast<LoadInst>(I);
4701   StoreInst *SI = dyn_cast<StoreInst>(I);
4702   assert((LI || SI) && "Invalid memory instruction");
4703 
4704   auto *Ptr = getLoadStorePointerOperand(I);
4705 
4706   // In order to be widened, the pointer should be consecutive, first of all.
4707   if (!Legal->isConsecutivePtr(Ptr))
4708     return false;
4709 
4710   // If the instruction is a store located in a predicated block, it will be
4711   // scalarized.
4712   if (isScalarWithPredication(I))
4713     return false;
4714 
  // If the instruction's allocated size doesn't equal its type size, it
4716   // requires padding and will be scalarized.
4717   auto &DL = I->getModule()->getDataLayout();
4718   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4719   if (hasIrregularType(ScalarTy, DL, VF))
4720     return false;
4721 
4722   return true;
4723 }
4724 
4725 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4726   // We should not collect Uniforms more than once per VF. Right now,
4727   // this function is called from collectUniformsAndScalars(), which
4728   // already does this check. Collecting Uniforms for VF=1 does not make any
4729   // sense.
4730 
4731   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4732          "This function should not be visited twice for the same VF");
4733 
  // Visit the list of Uniforms. Even if we do not find any uniform value, we
  // will not analyze it again: Uniforms.count(VF) will return 1 once the entry
  // below is created.
4736   Uniforms[VF].clear();
4737 
4738   // We now know that the loop is vectorizable!
4739   // Collect instructions inside the loop that will remain uniform after
4740   // vectorization.
4741 
  // Global values, params, and instructions outside of the current loop are
  // out of scope.
4744   auto isOutOfScope = [&](Value *V) -> bool {
4745     Instruction *I = dyn_cast<Instruction>(V);
4746     return (!I || !TheLoop->contains(I));
4747   };
4748 
4749   SetVector<Instruction *> Worklist;
4750   BasicBlock *Latch = TheLoop->getLoopLatch();
4751 
4752   // Instructions that are scalar with predication must not be considered
4753   // uniform after vectorization, because that would create an erroneous
4754   // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if found important; see PR40816.
4756   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4757     if (isScalarWithPredication(I, VF)) {
4758       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4759                         << *I << "\n");
4760       return;
4761     }
4762     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4763     Worklist.insert(I);
4764   };
4765 
4766   // Start with the conditional branch. If the branch condition is an
4767   // instruction contained in the loop that is only used by the branch, it is
4768   // uniform.
4769   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4770   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4771     addToWorklistIfAllowed(Cmp);
4772 
4773   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4774   // are pointers that are treated like consecutive pointers during
4775   // vectorization. The pointer operands of interleaved accesses are an
4776   // example.
4777   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4778 
4779   // Holds pointer operands of instructions that are possibly non-uniform.
4780   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4781 
4782   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4783     InstWidening WideningDecision = getWideningDecision(I, VF);
4784     assert(WideningDecision != CM_Unknown &&
4785            "Widening decision should be ready at this moment");
4786 
4787     return (WideningDecision == CM_Widen ||
4788             WideningDecision == CM_Widen_Reverse ||
4789             WideningDecision == CM_Interleave);
4790   };
4791   // Iterate over the instructions in the loop, and collect all
4792   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4793   // that a consecutive-like pointer operand will be scalarized, we collect it
4794   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4795   // getelementptr instruction can be used by both vectorized and scalarized
4796   // memory instructions. For example, if a loop loads and stores from the same
4797   // location, but the store is conditional, the store will be scalarized, and
4798   // the getelementptr won't remain uniform.
4799   for (auto *BB : TheLoop->blocks())
4800     for (auto &I : *BB) {
4801       // If there's no pointer operand, there's nothing to do.
4802       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4803       if (!Ptr)
4804         continue;
4805 
4806       // True if all users of Ptr are memory accesses that have Ptr as their
4807       // pointer operand.
4808       auto UsersAreMemAccesses =
4809           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4810             return getLoadStorePointerOperand(U) == Ptr;
4811           });
4812 
4813       // Ensure the memory instruction will not be scalarized or used by
4814       // gather/scatter, making its pointer operand non-uniform. If the pointer
4815       // operand is used by any instruction other than a memory access, we
4816       // conservatively assume the pointer operand may be non-uniform.
4817       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4818         PossibleNonUniformPtrs.insert(Ptr);
4819 
4820       // If the memory instruction will be vectorized and its pointer operand
4821       // is consecutive-like, or interleaving - the pointer operand should
4822       // remain uniform.
4823       else
4824         ConsecutiveLikePtrs.insert(Ptr);
4825     }
4826 
4827   // Add to the Worklist all consecutive and consecutive-like pointers that
4828   // aren't also identified as possibly non-uniform.
4829   for (auto *V : ConsecutiveLikePtrs)
4830     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4831       addToWorklistIfAllowed(V);
4832 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
4836   unsigned idx = 0;
4837   while (idx != Worklist.size()) {
4838     Instruction *I = Worklist[idx++];
4839 
4840     for (auto OV : I->operand_values()) {
4841       // isOutOfScope operands cannot be uniform instructions.
4842       if (isOutOfScope(OV))
4843         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4846       auto *OP = dyn_cast<PHINode>(OV);
4847       if (OP && Legal->isFirstOrderRecurrence(OP))
4848         continue;
4849       // If all the users of the operand are uniform, then add the
4850       // operand into the uniform worklist.
4851       auto *OI = cast<Instruction>(OV);
4852       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4853             auto *J = cast<Instruction>(U);
4854             return Worklist.count(J) ||
4855                    (OI == getLoadStorePointerOperand(J) &&
4856                     isUniformDecision(J, VF));
4857           }))
4858         addToWorklistIfAllowed(OI);
4859     }
4860   }
4861 
4862   // Returns true if Ptr is the pointer operand of a memory access instruction
4863   // I, and I is known to not require scalarization.
4864   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4865     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4866   };
4867 
4868   // For an instruction to be added into Worklist above, all its users inside
4869   // the loop should also be in Worklist. However, this condition cannot be
4870   // true for phi nodes that form a cyclic dependence. We must process phi
4871   // nodes separately. An induction variable will remain uniform if all users
4872   // of the induction variable and induction variable update remain uniform.
4873   // The code below handles both pointer and non-pointer induction variables.
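  // For example (illustrative): in a loop such as
  //   for (i = 0; i < n; ++i) a[i] = x;
  // the induction variable 'i' and its update 'i + 1' are used only by the
  // consecutive store address and by the latch compare, so both remain
  // uniform, i.e. a single scalar copy per vector iteration suffices.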
4874   for (auto &Induction : Legal->getInductionVars()) {
4875     auto *Ind = Induction.first;
4876     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4877 
4878     // Determine if all users of the induction variable are uniform after
4879     // vectorization.
4880     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4881       auto *I = cast<Instruction>(U);
4882       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4883              isVectorizedMemAccessUse(I, Ind);
4884     });
4885     if (!UniformInd)
4886       continue;
4887 
4888     // Determine if all users of the induction variable update instruction are
4889     // uniform after vectorization.
4890     auto UniformIndUpdate =
4891         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4892           auto *I = cast<Instruction>(U);
4893           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4894                  isVectorizedMemAccessUse(I, IndUpdate);
4895         });
4896     if (!UniformIndUpdate)
4897       continue;
4898 
4899     // The induction variable and its update instruction will remain uniform.
4900     addToWorklistIfAllowed(Ind);
4901     addToWorklistIfAllowed(IndUpdate);
4902   }
4903 
4904   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4905 }
4906 
4907 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4908   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4909 
4910   if (Legal->getRuntimePointerChecking()->Need) {
4911     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4912         "runtime pointer checks needed. Enable vectorization of this "
4913         "loop with '#pragma clang loop vectorize(enable)' when "
4914         "compiling with -Os/-Oz",
4915         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4916     return true;
4917   }
4918 
4919   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4920     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4921         "runtime SCEV checks needed. Enable vectorization of this "
4922         "loop with '#pragma clang loop vectorize(enable)' when "
4923         "compiling with -Os/-Oz",
4924         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4925     return true;
4926   }
4927 
4928   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4929   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4930     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4931         "runtime stride == 1 checks needed. Enable vectorization of "
4932         "this loop with '#pragma clang loop vectorize(enable)' when "
4933         "compiling with -Os/-Oz",
4934         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4935     return true;
4936   }
4937 
4938   return false;
4939 }
4940 
4941 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4942   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
4945     reportVectorizationFailure(
4946         "Not inserting runtime ptr check for divergent target",
4947         "runtime pointer checks needed. Not enabled for divergent target",
4948         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4949     return None;
4950   }
4951 
4952   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4953   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4954   if (TC == 1) {
4955     reportVectorizationFailure("Single iteration (non) loop",
4956         "loop trip count is one, irrelevant for vectorization",
4957         "SingleIterationLoop", ORE, TheLoop);
4958     return None;
4959   }
4960 
4961   switch (ScalarEpilogueStatus) {
4962   case CM_ScalarEpilogueAllowed:
4963     return computeFeasibleMaxVF(TC);
4964   case CM_ScalarEpilogueNotNeededUsePredicate:
4965     LLVM_DEBUG(
4966         dbgs() << "LV: vector predicate hint/switch found.\n"
4967                << "LV: Not allowing scalar epilogue, creating predicated "
4968                << "vector loop.\n");
4969     break;
4970   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4971     // fallthrough as a special case of OptForSize
4972   case CM_ScalarEpilogueNotAllowedOptSize:
4973     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4974       LLVM_DEBUG(
4975           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4976     else
4977       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4978                         << "count.\n");
4979 
    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
4982     if (runtimeChecksRequired())
4983       return None;
4984     break;
4985   }
4986 
4987   // Now try the tail folding
4988 
4989   // Invalidate interleave groups that require an epilogue if we can't mask
4990   // the interleave-group.
4991   if (!useMaskedInterleavedAccesses(TTI)) {
4992     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4993            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
4996     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4997   }
4998 
4999   unsigned MaxVF = computeFeasibleMaxVF(TC);
5000   if (TC > 0 && TC % MaxVF == 0) {
5001     // Accept MaxVF if we do not have a tail.
5002     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5003     return MaxVF;
5004   }
5005 
5006   // If we don't know the precise trip count, or if the trip count that we
5007   // found modulo the vectorization factor is not zero, try to fold the tail
5008   // by masking.
5009   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5010   if (Legal->prepareToFoldTailByMasking()) {
5011     FoldTailByMasking = true;
5012     return MaxVF;
5013   }
5014 
5015   if (TC == 0) {
5016     reportVectorizationFailure(
5017         "Unable to calculate the loop count due to complex control flow",
5018         "unable to calculate the loop count due to complex control flow",
5019         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5020     return None;
5021   }
5022 
5023   reportVectorizationFailure(
5024       "Cannot optimize for size and vectorize at the same time.",
5025       "cannot optimize for size and vectorize at the same time. "
5026       "Enable vectorization of this loop with '#pragma clang loop "
5027       "vectorize(enable)' when compiling with -Os/-Oz",
5028       "NoTailLoopWithOptForSize", ORE, TheLoop);
5029   return None;
5030 }
5031 
5032 unsigned
5033 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5034   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5035   unsigned SmallestType, WidestType;
5036   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5037   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5038 
5039   // Get the maximum safe dependence distance in bits computed by LAA.
5040   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5042   // dependence distance).
5043   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5044 
5045   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5046 
5047   unsigned MaxVectorSize = WidestRegister / WidestType;
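  // For example (illustrative): with a 256-bit widest register and a widest
  // in-loop type of 32 bits, MaxVectorSize = 256 / 32 = 8 lanes. If LAA has
  // limited the safe register width to 128 bits, the clamp above instead
  // yields 128 / 32 = 4 lanes.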
5048 
5049   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5050                     << " / " << WidestType << " bits.\n");
5051   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5052                     << WidestRegister << " bits.\n");
5053 
5054   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5055                                  " into one vector!");
5056   if (MaxVectorSize == 0) {
5057     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5058     MaxVectorSize = 1;
5059     return MaxVectorSize;
5060   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5061              isPowerOf2_32(ConstTripCount)) {
5062     // We need to clamp the VF to be the ConstTripCount. There is no point in
5063     // choosing a higher viable VF as done in the loop below.
5064     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5065                       << ConstTripCount << "\n");
5066     MaxVectorSize = ConstTripCount;
5067     return MaxVectorSize;
5068   }
5069 
5070   unsigned MaxVF = MaxVectorSize;
5071   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5072       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5073     // Collect all viable vectorization factors larger than the default MaxVF
5074     // (i.e. MaxVectorSize).
5075     SmallVector<unsigned, 8> VFs;
5076     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5077     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5078       VFs.push_back(VS);
5079 
5080     // For each VF calculate its register usage.
5081     auto RUs = calculateRegisterUsage(VFs);
5082 
5083     // Select the largest VF which doesn't require more registers than existing
5084     // ones.
5085     for (int i = RUs.size() - 1; i >= 0; --i) {
5086       bool Selected = true;
5087       for (auto& pair : RUs[i].MaxLocalUsers) {
5088         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5089         if (pair.second > TargetNumRegisters)
5090           Selected = false;
5091       }
5092       if (Selected) {
5093         MaxVF = VFs[i];
5094         break;
5095       }
5096     }
5097     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5098       if (MaxVF < MinVF) {
5099         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5100                           << ") with target's minimum: " << MinVF << '\n');
5101         MaxVF = MinVF;
5102       }
5103     }
5104   }
5105   return MaxVF;
5106 }
5107 
5108 VectorizationFactor
5109 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5110   float Cost = expectedCost(1).first;
5111   const float ScalarCost = Cost;
5112   unsigned Width = 1;
5113   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5114 
5115   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5116   if (ForceVectorization && MaxVF > 1) {
5117     // Ignore scalar width, because the user explicitly wants vectorization.
5118     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5119     // evaluation.
5120     Cost = std::numeric_limits<float>::max();
5121   }
5122 
5123   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the vectorization
    // factor, i.e. the number of elements processed per vector iteration.
5127     VectorizationCostTy C = expectedCost(i);
5128     float VectorCost = C.first / (float)i;
5129     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5130                       << " costs: " << (int)VectorCost << ".\n");
5131     if (!C.second && !ForceVectorization) {
5132       LLVM_DEBUG(
5133           dbgs() << "LV: Not considering vector loop of width " << i
5134                  << " because it will not generate any vector instructions.\n");
5135       continue;
5136     }
5137     if (VectorCost < Cost) {
5138       Cost = VectorCost;
5139       Width = i;
5140     }
5141   }
5142 
5143   if (!EnableCondStoresVectorization && NumPredStores) {
5144     reportVectorizationFailure("There are conditional stores.",
5145         "store that is conditionally executed prevents vectorization",
5146         "ConditionalStore", ORE, TheLoop);
5147     Width = 1;
5148     Cost = ScalarCost;
5149   }
5150 
5151   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5152              << "LV: Vectorization seems to be not beneficial, "
5153              << "but was forced by a user.\n");
5154   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5155   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5156   return Factor;
5157 }
5158 
5159 std::pair<unsigned, unsigned>
5160 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5161   unsigned MinWidth = -1U;
5162   unsigned MaxWidth = 8;
5163   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5164 
5165   // For each block.
5166   for (BasicBlock *BB : TheLoop->blocks()) {
5167     // For each instruction in the loop.
5168     for (Instruction &I : BB->instructionsWithoutDebug()) {
5169       Type *T = I.getType();
5170 
5171       // Skip ignored values.
5172       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5173         continue;
5174 
5175       // Only examine Loads, Stores and PHINodes.
5176       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5177         continue;
5178 
5179       // Examine PHI nodes that are reduction variables. Update the type to
5180       // account for the recurrence type.
5181       if (auto *PN = dyn_cast<PHINode>(&I)) {
5182         if (!Legal->isReductionVariable(PN))
5183           continue;
5184         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5185         T = RdxDesc.getRecurrenceType();
5186       }
5187 
5188       // Examine the stored values.
5189       if (auto *ST = dyn_cast<StoreInst>(&I))
5190         T = ST->getValueOperand()->getType();
5191 
5192       // Ignore loaded pointer types and stored pointer types that are not
5193       // vectorizable.
5194       //
5195       // FIXME: The check here attempts to predict whether a load or store will
5196       //        be vectorized. We only know this for certain after a VF has
5197       //        been selected. Here, we assume that if an access can be
5198       //        vectorized, it will be. We should also look at extending this
5199       //        optimization to non-pointer types.
5200       //
5201       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5202           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5203         continue;
5204 
5205       MinWidth = std::min(MinWidth,
5206                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5207       MaxWidth = std::max(MaxWidth,
5208                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5209     }
5210   }
5211 
5212   return {MinWidth, MaxWidth};
5213 }
5214 
5215 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5216                                                            unsigned LoopCost) {
5217   // -- The interleave heuristics --
5218   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5219   // There are many micro-architectural considerations that we can't predict
5220   // at this level. For example, frontend pressure (on decode or fetch) due to
5221   // code size, or the number and capabilities of the execution ports.
5222   //
5223   // We use the following heuristics to select the interleave count:
5224   // 1. If the code has reductions, then we interleave to break the cross
5225   // iteration dependency.
5226   // 2. If the loop is really small, then we interleave to reduce the loop
5227   // overhead.
5228   // 3. We don't interleave if we think that we will spill registers to memory
5229   // due to the increased register pressure.
5230 
5231   if (!isScalarEpilogueAllowed())
5232     return 1;
5233 
  // The maximum safe dependence distance was already used to limit the
  // vectorization factor, so do not interleave on top of that.
5235   if (Legal->getMaxSafeDepDistBytes() != -1U)
5236     return 1;
5237 
5238   // Do not interleave loops with a relatively small known or estimated trip
5239   // count.
5240   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5241   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5242     return 1;
5243 
5244   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so make sure each is at least one, i.e.
  // assume that every instruction uses at least one register.
5247   for (auto& pair : R.MaxLocalUsers) {
5248     pair.second = std::max(pair.second, 1U);
5249   }
5250 
5251   // We calculate the interleave count using the following formula.
5252   // Subtract the number of loop invariants from the number of available
5253   // registers. These registers are used by all of the interleaved instances.
5254   // Next, divide the remaining registers by the number of registers that is
5255   // required by the loop, in order to estimate how many parallel instances
5256   // fit without causing spills. All of this is rounded down if necessary to be
5257   // a power of two. We want power of two interleave count to simplify any
5258   // addressing operations or alignment considerations.
5259   // We also want power of two interleave counts to ensure that the induction
5260   // variable of the vector loop wraps to zero, when tail is folded by masking;
5261   // this currently happens when OptForSize, in which case IC is set to 1 above.
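  // For example (illustrative): with 16 registers in a class, 2 of them held
  // by loop-invariant values, and a maximum local usage of 4 registers per
  // instance, the class permits PowerOf2Floor((16 - 2) / 4) =
  // PowerOf2Floor(3) = 2 interleaved instances.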
5262   unsigned IC = UINT_MAX;
5263 
5264   for (auto& pair : R.MaxLocalUsers) {
5265     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5266     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5267                       << " registers of "
5268                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5269     if (VF == 1) {
5270       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5271         TargetNumRegisters = ForceTargetNumScalarRegs;
5272     } else {
5273       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5274         TargetNumRegisters = ForceTargetNumVectorRegs;
5275     }
5276     unsigned MaxLocalUsers = pair.second;
5277     unsigned LoopInvariantRegs = 0;
5278     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5279       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5280 
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
5282     // Don't count the induction variable as interleaved.
5283     if (EnableIndVarRegisterHeur) {
5284       TmpIC =
5285           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5286                         std::max(1U, (MaxLocalUsers - 1)));
5287     }
5288 
5289     IC = std::min(IC, TmpIC);
5290   }
5291 
5292   // Clamp the interleave ranges to reasonable counts.
5293   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5294 
5295   // Check if the user has overridden the max.
5296   if (VF == 1) {
5297     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5298       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5299   } else {
5300     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5301       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5302   }
5303 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to be at most the trip count divided by VF.
5306   if (BestKnownTC) {
5307     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5308   }
5309 
5310   // If we did not calculate the cost for VF (because the user selected the VF)
5311   // then we calculate the cost of VF here.
5312   if (LoopCost == 0)
5313     LoopCost = expectedCost(VF).first;
5314 
5315   assert(LoopCost && "Non-zero loop cost expected");
5316 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and the trip count allow.
5319   if (IC > MaxInterleaveCount)
5320     IC = MaxInterleaveCount;
5321   else if (IC < 1)
5322     IC = 1;
5323 
5324   // Interleave if we vectorized this loop and there is a reduction that could
5325   // benefit from interleaving.
5326   if (VF > 1 && !Legal->getReductionVars().empty()) {
5327     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5328     return IC;
5329   }
5330 
5331   // Note that if we've already vectorized the loop we will have done the
5332   // runtime check and so interleaving won't require further checks.
5333   bool InterleavingRequiresRuntimePointerCheck =
5334       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5335 
5336   // We want to interleave small loops in order to reduce the loop overhead and
5337   // potentially expose ILP opportunities.
5338   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5339   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5340     // We assume that the cost overhead is 1 and we use the cost model
5341     // to estimate the cost of the loop and interleave until the cost of the
5342     // loop overhead is about 5% of the cost of the loop.
5343     unsigned SmallIC =
5344         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
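    // For example (illustrative): if SmallLoopCost is 20 and the computed
    // loop cost is 3, then PowerOf2Floor(20 / 3) = PowerOf2Floor(6) = 4, so
    // up to four interleaved copies keep the per-iteration overhead small.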
5345 
5346     // Interleave until store/load ports (estimated by max interleave count) are
5347     // saturated.
5348     unsigned NumStores = Legal->getNumStores();
5349     unsigned NumLoads = Legal->getNumLoads();
5350     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5351     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5352 
5353     // If we have a scalar reduction (vector reductions are already dealt with
5354     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2 so
    // that the critical path only gets increased by one reduction operation.
5357     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5358       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5359       SmallIC = std::min(SmallIC, F);
5360       StoresIC = std::min(StoresIC, F);
5361       LoadsIC = std::min(LoadsIC, F);
5362     }
5363 
5364     if (EnableLoadStoreRuntimeInterleave &&
5365         std::max(StoresIC, LoadsIC) > SmallIC) {
5366       LLVM_DEBUG(
5367           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5368       return std::max(StoresIC, LoadsIC);
5369     }
5370 
5371     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5372     return SmallIC;
5373   }
5374 
5375   // Interleave if this is a large loop (small loops are already dealt with by
5376   // this point) that could benefit from interleaving.
5377   bool HasReductions = !Legal->getReductionVars().empty();
5378   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5379     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5380     return IC;
5381   }
5382 
5383   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5384   return 1;
5385 }
5386 
5387 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5388 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5389   // This function calculates the register usage by measuring the highest number
5390   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order and
5392   // assign a number to each instruction. We use RPO to ensure that defs are
5393   // met before their users. We assume that each instruction that has in-loop
5394   // users starts an interval. We record every time that an in-loop value is
5395   // used, so we have a list of the first and last occurrences of each
5396   // instruction. Next, we transpose this data structure into a multi map that
5397   // holds the list of intervals that *end* at a specific location. This multi
5398   // map allows us to perform a linear search. We scan the instructions linearly
5399   // and record each time that a new interval starts, by placing it in a set.
5400   // If we find this value in the multi-map then we remove it from the set.
5401   // The max register usage is the maximum size of the set.
5402   // We also search for instructions that are defined outside the loop, but are
5403   // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
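  // For example (illustrative): in the straight-line sequence
  //   %a = load ...
  //   %b = add %a, 1
  //   %c = mul %b, %b
  // the interval of %a ends at %b and the interval of %b ends at %c, so at
  // most two values are live at any point, giving an estimated usage of two
  // registers per class (scaled by VF for values that become vectors).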
5406   LoopBlocksDFS DFS(TheLoop);
5407   DFS.perform(LI);
5408 
5409   RegisterUsage RU;
5410 
5411   // Each 'key' in the map opens a new interval. The values
5412   // of the map are the index of the 'last seen' usage of the
5413   // instruction that is the key.
5414   using IntervalMap = DenseMap<Instruction *, unsigned>;
5415 
5416   // Maps instruction to its index.
5417   SmallVector<Instruction *, 64> IdxToInstr;
5418   // Marks the end of each interval.
5419   IntervalMap EndPoint;
  // Saves the set of loop instructions that have in-loop users.
5421   SmallPtrSet<Instruction *, 8> Ends;
5422   // Saves the list of values that are used in the loop but are
5423   // defined outside the loop, such as arguments and constants.
5424   SmallPtrSet<Value *, 8> LoopInvariants;
5425 
5426   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5427     for (Instruction &I : BB->instructionsWithoutDebug()) {
5428       IdxToInstr.push_back(&I);
5429 
5430       // Save the end location of each USE.
5431       for (Value *U : I.operands()) {
5432         auto *Instr = dyn_cast<Instruction>(U);
5433 
5434         // Ignore non-instruction values such as arguments, constants, etc.
5435         if (!Instr)
5436           continue;
5437 
5438         // If this instruction is outside the loop then record it and continue.
5439         if (!TheLoop->contains(Instr)) {
5440           LoopInvariants.insert(Instr);
5441           continue;
5442         }
5443 
5444         // Overwrite previous end points.
5445         EndPoint[Instr] = IdxToInstr.size();
5446         Ends.insert(Instr);
5447       }
5448     }
5449   }
5450 
5451   // Saves the list of intervals that end with the index in 'key'.
5452   using InstrList = SmallVector<Instruction *, 2>;
5453   DenseMap<unsigned, InstrList> TransposeEnds;
5454 
5455   // Transpose the EndPoints to a list of values that end at each index.
5456   for (auto &Interval : EndPoint)
5457     TransposeEnds[Interval.second].push_back(Interval.first);
5458 
5459   SmallPtrSet<Instruction *, 8> OpenIntervals;
5460 
5461   // Get the size of the widest register.
5462   unsigned MaxSafeDepDist = -1U;
5463   if (Legal->getMaxSafeDepDistBytes() != -1U)
5464     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5465   unsigned WidestRegister =
5466       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5467   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5468 
5469   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5470   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5471 
5472   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5473 
5474   // A lambda that gets the register usage for the given type and VF.
5475   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5476     if (Ty->isTokenTy())
5477       return 0U;
5478     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5479     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5480   };
5481 
5482   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5483     Instruction *I = IdxToInstr[i];
5484 
5485     // Remove all of the instructions that end at this location.
5486     InstrList &List = TransposeEnds[i];
5487     for (Instruction *ToRemove : List)
5488       OpenIntervals.erase(ToRemove);
5489 
5490     // Ignore instructions that are never used within the loop.
5491     if (Ends.find(I) == Ends.end())
5492       continue;
5493 
5494     // Skip ignored values.
5495     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5496       continue;
5497 
5498     // For each VF find the maximum usage of registers.
5499     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5500       // Count the number of live intervals.
5501       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5502 
5503       if (VFs[j] == 1) {
5504         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
5510         }
5511       } else {
5512         collectUniformsAndScalars(VFs[j]);
5513         for (auto Inst : OpenIntervals) {
5514           // Skip ignored values for VF > 1.
5515           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5516             continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
5530         }
5531       }
5532 
      for (auto &pair : RegUsage) {
        unsigned &Entry = MaxUsages[j][pair.first];
        Entry = std::max(Entry, pair.second);
      }
5539     }
5540 
5541     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5542                       << OpenIntervals.size() << '\n');
5543 
5544     // Add the current instruction to the list of open intervals.
5545     OpenIntervals.insert(I);
5546   }
5547 
5548   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5549     SmallMapVector<unsigned, unsigned, 4> Invariant;
5550 
5551     for (auto Inst : LoopInvariants) {
5552       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
      Invariant[ClassID] += Usage;
5558     }
5559 
5560     LLVM_DEBUG({
5561       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5562       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5563              << " item\n";
5564       for (const auto &pair : MaxUsages[i]) {
5565         dbgs() << "LV(REG): RegisterClass: "
5566                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5567                << " registers\n";
5568       }
5569       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5570              << " item\n";
5571       for (const auto &pair : Invariant) {
5572         dbgs() << "LV(REG): RegisterClass: "
5573                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5574                << " registers\n";
5575       }
5576     });
5577 
5578     RU.LoopInvariantRegs = Invariant;
5579     RU.MaxLocalUsers = MaxUsages[i];
5580     RUs[i] = RU;
5581   }
5582 
5583   return RUs;
5584 }
5585 
5586 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5587   // TODO: Cost model for emulated masked load/store is completely
5588   // broken. This hack guides the cost model to use an artificially
5589   // high enough value to practically disable vectorization with such
5590   // operations, except where previously deployed legality hack allowed
5591   // using very low cost values. This is to avoid regressions coming simply
5592   // from moving "masked load/store" check from legality to cost model.
5593   // Masked Load/Gather emulation was previously never allowed.
  // Only a limited number of masked store/scatter emulations were allowed.
5595   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5596   return isa<LoadInst>(I) ||
5597          (isa<StoreInst>(I) &&
5598           NumPredStores > NumberOfStoresToPredicate);
5599 }
5600 
5601 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5602   // If we aren't vectorizing the loop, or if we've already collected the
5603   // instructions to scalarize, there's nothing to do. Collection may already
5604   // have occurred if we have a user-selected VF and are now computing the
5605   // expected cost for interleaving.
5606   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5607     return;
5608 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5610   // not profitable to scalarize any instructions, the presence of VF in the
5611   // map will indicate that we've analyzed it already.
5612   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5613 
5614   // Find all the instructions that are scalar with predication in the loop and
5615   // determine if it would be better to not if-convert the blocks they are in.
5616   // If so, we also record the instructions to scalarize.
5617   for (BasicBlock *BB : TheLoop->blocks()) {
5618     if (!blockNeedsPredication(BB))
5619       continue;
5620     for (Instruction &I : *BB)
5621       if (isScalarWithPredication(&I)) {
5622         ScalarCostsTy ScalarCosts;
5623         // Do not apply discount logic if hacked cost is needed
5624         // for emulated masked memrefs.
5625         if (!useEmulatedMaskMemRefHack(&I) &&
5626             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5627           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5628         // Remember that BB will remain after vectorization.
5629         PredicatedBBsAfterVectorization.insert(BB);
5630       }
5631   }
5632 }
5633 
5634 int LoopVectorizationCostModel::computePredInstDiscount(
5635     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5636     unsigned VF) {
5637   assert(!isUniformAfterVectorization(PredInst, VF) &&
5638          "Instruction marked uniform-after-vectorization will be predicated");
5639 
5640   // Initialize the discount to zero, meaning that the scalar version and the
5641   // vector version cost the same.
5642   int Discount = 0;
5643 
5644   // Holds instructions to analyze. The instructions we visit are mapped in
5645   // ScalarCosts. Those instructions are the ones that would be scalarized if
5646   // we find that the scalar version costs less.
5647   SmallVector<Instruction *, 8> Worklist;
5648 
5649   // Returns true if the given instruction can be scalarized.
5650   auto canBeScalarized = [&](Instruction *I) -> bool {
5651     // We only attempt to scalarize instructions forming a single-use chain
5652     // from the original predicated block that would otherwise be vectorized.
5653     // Although not strictly necessary, we give up on instructions we know will
5654     // already be scalar to avoid traversing chains that are unlikely to be
5655     // beneficial.
5656     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5657         isScalarAfterVectorization(I, VF))
5658       return false;
5659 
5660     // If the instruction is scalar with predication, it will be analyzed
5661     // separately. We ignore it within the context of PredInst.
5662     if (isScalarWithPredication(I))
5663       return false;
5664 
5665     // If any of the instruction's operands are uniform after vectorization,
5666     // the instruction cannot be scalarized. This prevents, for example, a
5667     // masked load from being scalarized.
5668     //
5669     // We assume we will only emit a value for lane zero of an instruction
5670     // marked uniform after vectorization, rather than VF identical values.
5671     // Thus, if we scalarize an instruction that uses a uniform, we would
5672     // create uses of values corresponding to the lanes we aren't emitting code
5673     // for. This behavior can be changed by allowing getScalarValue to clone
5674     // the lane zero values for uniforms rather than asserting.
5675     for (Use &U : I->operands())
5676       if (auto *J = dyn_cast<Instruction>(U.get()))
5677         if (isUniformAfterVectorization(J, VF))
5678           return false;
5679 
5680     // Otherwise, we can scalarize the instruction.
5681     return true;
5682   };
5683 
5684   // Compute the expected cost discount from scalarizing the entire expression
5685   // feeding the predicated instruction. We currently only consider expressions
5686   // that are single-use instruction chains.
5687   Worklist.push_back(PredInst);
5688   while (!Worklist.empty()) {
5689     Instruction *I = Worklist.pop_back_val();
5690 
5691     // If we've already analyzed the instruction, there's nothing to do.
5692     if (ScalarCosts.find(I) != ScalarCosts.end())
5693       continue;
5694 
5695     // Compute the cost of the vector instruction. Note that this cost already
5696     // includes the scalarization overhead of the predicated instruction.
5697     unsigned VectorCost = getInstructionCost(I, VF).first;
5698 
5699     // Compute the cost of the scalarized instruction. This cost is the cost of
5700     // the instruction as if it wasn't if-converted and instead remained in the
5701     // predicated block. We will scale this cost by block probability after
5702     // computing the scalarization overhead.
5703     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5704 
5705     // Compute the scalarization overhead of needed insertelement instructions
5706     // and phi nodes.
5707     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5708       ScalarCost += TTI.getScalarizationOverhead(
5709           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5710           APInt::getAllOnesValue(VF), true, false);
5711       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5712     }
5713 
5714     // Compute the scalarization overhead of needed extractelement
5715     // instructions. For each of the instruction's operands, if the operand can
5716     // be scalarized, add it to the worklist; otherwise, account for the
5717     // overhead.
5718     for (Use &U : I->operands())
5719       if (auto *J = dyn_cast<Instruction>(U.get())) {
5720         assert(VectorType::isValidElementType(J->getType()) &&
5721                "Instruction has non-scalar type");
5722         if (canBeScalarized(J))
5723           Worklist.push_back(J);
5724         else if (needsExtract(J, VF))
5725           ScalarCost += TTI.getScalarizationOverhead(
5726               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5727               APInt::getAllOnesValue(VF), false, true);
5728       }
5729 
5730     // Scale the total scalar cost by block probability.
5731     ScalarCost /= getReciprocalPredBlockProb();
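    // For example (illustrative): with VF = 4, a vector cost of 12, and a
    // per-lane scalar cost of 2 (ignoring any scalarization overhead), the
    // scalar cost is 4 * 2 = 8; if getReciprocalPredBlockProb() returns 2
    // (the predicated block executes half the time), it is scaled down to 4,
    // and the discount below grows by 12 - 4 = 8 in favor of scalarizing.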
5732 
5733     // Compute the discount. A non-negative discount means the vector version
5734     // of the instruction costs more, and scalarizing would be beneficial.
5735     Discount += VectorCost - ScalarCost;
5736     ScalarCosts[I] = ScalarCost;
5737   }
5738 
5739   return Discount;
5740 }
5741 
5742 LoopVectorizationCostModel::VectorizationCostTy
5743 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5744   VectorizationCostTy Cost;
5745 
5746   // For each block.
5747   for (BasicBlock *BB : TheLoop->blocks()) {
5748     VectorizationCostTy BlockCost;
5749 
5750     // For each instruction in the old loop.
5751     for (Instruction &I : BB->instructionsWithoutDebug()) {
5752       // Skip ignored values.
5753       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5754           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5755         continue;
5756 
5757       VectorizationCostTy C = getInstructionCost(&I, VF);
5758 
5759       // Check if we should override the cost.
5760       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5761         C.first = ForceTargetInstructionCost;
5762 
5763       BlockCost.first += C.first;
5764       BlockCost.second |= C.second;
5765       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5766                         << " for VF " << VF << " For instruction: " << I
5767                         << '\n');
5768     }
5769 
5770     // If we are vectorizing a predicated block, it will have been
5771     // if-converted. This means that the block's instructions (aside from
5772     // stores and instructions that may divide by zero) will now be
5773     // unconditionally executed. For the scalar case, we may not always execute
5774     // the predicated block. Thus, scale the block's cost by the probability of
5775     // executing it.
5776     if (VF == 1 && blockNeedsPredication(BB))
5777       BlockCost.first /= getReciprocalPredBlockProb();
5778 
5779     Cost.first += BlockCost.first;
5780     Cost.second |= BlockCost.second;
5781   }
5782 
5783   return Cost;
5784 }
5785 
5786 /// Gets Address Access SCEV after verifying that the access pattern
5787 /// is loop invariant except the induction variable dependence.
5788 ///
5789 /// This SCEV can be sent to the Target in order to estimate the address
5790 /// calculation cost.
5791 static const SCEV *getAddressAccessSCEV(
5792               Value *Ptr,
5793               LoopVectorizationLegality *Legal,
5794               PredicatedScalarEvolution &PSE,
5795               const Loop *TheLoop) {
5796 
5797   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5798   if (!Gep)
5799     return nullptr;
5800 
5801   // We are looking for a gep with all loop invariant indices except for one
5802   // which should be an induction variable.
5803   auto SE = PSE.getSE();
5804   unsigned NumOperands = Gep->getNumOperands();
5805   for (unsigned i = 1; i < NumOperands; ++i) {
5806     Value *Opd = Gep->getOperand(i);
5807     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5808         !Legal->isInductionVariable(Opd))
5809       return nullptr;
5810   }
5811 
5812   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5813   return PSE.getSCEV(Ptr);
5814 }
5815 
5816 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5817   return Legal->hasStride(I->getOperand(0)) ||
5818          Legal->hasStride(I->getOperand(1));
5819 }
5820 
5821 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5822                                                                  unsigned VF) {
5823   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5824   Type *ValTy = getMemInstValueType(I);
5825   auto SE = PSE.getSE();
5826 
5827   unsigned AS = getLoadStoreAddressSpace(I);
5828   Value *Ptr = getLoadStorePointerOperand(I);
5829   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5830 
5831   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5833   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5834 
5835   // Get the cost of the scalar memory instruction and address computation.
5836   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5837 
  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop, where its user will be a vectorized instruction.
5840   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5841   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5842                                    Alignment, AS,
5843                                    TTI::TCK_RecipThroughput);
5844 
5845   // Get the overhead of the extractelement and insertelement instructions
5846   // we might create due to scalarization.
5847   Cost += getScalarizationOverhead(I, VF);
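  // For example (illustrative): a scalarized load at VF = 4 pays for four
  // address computations, four scalar loads, and the insertelement overhead
  // needed to rebuild the vector value consumed by its vectorized users.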
5848 
5849   // If we have a predicated store, it may not be executed for each vector
5850   // lane. Scale the cost by the probability of executing the predicated
5851   // block.
5852   if (isPredicatedInst(I)) {
5853     Cost /= getReciprocalPredBlockProb();
5854 
5855     if (useEmulatedMaskMemRefHack(I))
5856       // Artificially setting to a high enough value to practically disable
5857       // vectorization with such operations.
5858       Cost = 3000000;
5859   }
5860 
5861   return Cost;
5862 }
5863 
5864 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5865                                                              unsigned VF) {
5866   Type *ValTy = getMemInstValueType(I);
5867   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5868   Value *Ptr = getLoadStorePointerOperand(I);
5869   unsigned AS = getLoadStoreAddressSpace(I);
5870   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5871   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5872 
5873   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5874          "Stride should be 1 or -1 for consecutive memory access");
5875   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5876   unsigned Cost = 0;
5877   if (Legal->isMaskRequired(I))
5878     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5879                                       Alignment ? Alignment->value() : 0, AS,
5880                                       CostKind);
5881   else
5882     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5883                                 CostKind, I);
5884 
5885   bool Reverse = ConsecutiveStride < 0;
5886   if (Reverse)
5887     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5888   return Cost;
5889 }
5890 
5891 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5892                                                          unsigned VF) {
5893   Type *ValTy = getMemInstValueType(I);
5894   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5895   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5896   unsigned AS = getLoadStoreAddressSpace(I);
5897   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5898   if (isa<LoadInst>(I)) {
5899     return TTI.getAddressComputationCost(ValTy) +
5900            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5901                                CostKind) +
5902            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5903   }
5904   StoreInst *SI = cast<StoreInst>(I);
5905 
5906   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5907   return TTI.getAddressComputationCost(ValTy) +
5908          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5909                              CostKind) +
5910          (isLoopInvariantStoreValue
5911               ? 0
5912               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5913                                        VF - 1));
5914 }
5915 
5916 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5917                                                           unsigned VF) {
5918   Type *ValTy = getMemInstValueType(I);
5919   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5920   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5921   Value *Ptr = getLoadStorePointerOperand(I);
5922 
5923   return TTI.getAddressComputationCost(VectorTy) +
5924          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5925                                     Legal->isMaskRequired(I),
5926                                     Alignment ? Alignment->value() : 0,
5927                                     TargetTransformInfo::TCK_RecipThroughput,
5928                                     I);
5929 }
5930 
5931 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5932                                                             unsigned VF) {
5933   Type *ValTy = getMemInstValueType(I);
5934   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5935   unsigned AS = getLoadStoreAddressSpace(I);
5936 
5937   auto Group = getInterleavedAccessGroup(I);
5938   assert(Group && "Fail to get an interleaved access group.");
5939 
5940   unsigned InterleaveFactor = Group->getFactor();
5941   VectorType *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5942 
5943   // Holds the indices of existing members in an interleaved load group.
5944   // An interleaved store group doesn't need this as it doesn't allow gaps.
5945   SmallVector<unsigned, 4> Indices;
5946   if (isa<LoadInst>(I)) {
5947     for (unsigned i = 0; i < InterleaveFactor; i++)
5948       if (Group->getMember(i))
5949         Indices.push_back(i);
5950   }
5951 
5952   // Calculate the cost of the whole interleaved group.
5953   bool UseMaskForGaps =
5954       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5955   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5956       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5957       Group->getAlign().value(), AS, TTI::TCK_RecipThroughput,
5958       Legal->isMaskRequired(I), UseMaskForGaps);
5959 
5960   if (Group->isReverse()) {
5961     // TODO: Add support for reversed masked interleaved access.
5962     assert(!Legal->isMaskRequired(I) &&
5963            "Reverse masked interleaved access not supported.");
5964     Cost += Group->getNumMembers() *
5965             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5966   }
5967   return Cost;
5968 }
5969 
5970 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5971                                                               unsigned VF) {
5972   // Calculate scalar cost only. Vectorization cost should be ready at this
5973   // moment.
5974   if (VF == 1) {
5975     Type *ValTy = getMemInstValueType(I);
5976     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5977     unsigned AS = getLoadStoreAddressSpace(I);
5978 
5979     return TTI.getAddressComputationCost(ValTy) +
5980            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
5981                                TTI::TCK_RecipThroughput, I);
5982   }
5983   return getWideningCost(I, VF);
5984 }
5985 
5986 LoopVectorizationCostModel::VectorizationCostTy
5987 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5988   // If we know that this instruction will remain uniform, check the cost of
5989   // the scalar version.
5990   if (isUniformAfterVectorization(I, VF))
5991     VF = 1;
5992 
5993   if (VF > 1 && isProfitableToScalarize(I, VF))
5994     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5995 
5996   // Forced scalars do not have any scalarization overhead.
5997   auto ForcedScalar = ForcedScalars.find(VF);
5998   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5999     auto InstSet = ForcedScalar->second;
6000     if (InstSet.find(I) != InstSet.end())
6001       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
6002   }
6003 
6004   Type *VectorTy;
6005   unsigned C = getInstructionCost(I, VF, VectorTy);
6006 
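  // The type counts as "not scalarized" if the legalized vector value is
  // split into fewer than VF parts; if it takes VF or more parts, widening
  // effectively degenerates into one scalar operation per lane.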
6007   bool TypeNotScalarized =
6008       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
6009   return VectorizationCostTy(C, TypeNotScalarized);
6010 }
6011 
6012 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6013                                                               unsigned VF) {
6014 
6015   if (VF == 1)
6016     return 0;
6017 
6018   unsigned Cost = 0;
6019   Type *RetTy = ToVectorTy(I->getType(), VF);
6020   if (!RetTy->isVoidTy() &&
6021       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6022     Cost += TTI.getScalarizationOverhead(
6023         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false);
6024 
6025   // Some targets keep addresses scalar.
6026   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6027     return Cost;
6028 
6029   // Some targets support efficient element stores.
6030   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6031     return Cost;
6032 
6033   // Collect operands to consider.
6034   CallInst *CI = dyn_cast<CallInst>(I);
6035   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6036 
6037   // Skip operands that do not require extraction/scalarization and do not incur
6038   // any overhead.
6039   return Cost + TTI.getOperandsScalarizationOverhead(
6040                     filterExtractingOperands(Ops, VF), VF);
6041 }
6042 
6043 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6044   if (VF == 1)
6045     return;
6046   NumPredStores = 0;
6047   for (BasicBlock *BB : TheLoop->blocks()) {
6048     // For each instruction in the old loop.
6049     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6051       if (!Ptr)
6052         continue;
6053 
6054       // TODO: We should generate better code and update the cost model for
6055       // predicated uniform stores. Today they are treated as any other
6056       // predicated store (see added test cases in
6057       // invariant-store-vectorization.ll).
6058       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6059         NumPredStores++;
6060 
6061       if (Legal->isUniform(Ptr) &&
6062           // Conditional loads and stores should be scalarized and predicated.
6063           // isScalarWithPredication cannot be used here since masked
6064           // gather/scatters are not considered scalar with predication.
6065           !Legal->blockNeedsPredication(I.getParent())) {
6066         // TODO: Avoid replicating loads and stores instead of
6067         // relying on instcombine to remove them.
6068         // Load: Scalar load + broadcast
6069         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6070         unsigned Cost = getUniformMemOpCost(&I, VF);
6071         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6072         continue;
6073       }
6074 
6075       // We assume that widening is the best solution when possible.
6076       if (memoryInstructionCanBeWidened(&I, VF)) {
6077         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6078         int ConsecutiveStride =
6079                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6080         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6081                "Expected consecutive stride.");
6082         InstWidening Decision =
6083             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6084         setWideningDecision(&I, VF, Decision, Cost);
6085         continue;
6086       }
6087 
6088       // Choose between Interleaving, Gather/Scatter or Scalarization.
6089       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6090       unsigned NumAccesses = 1;
6091       if (isAccessInterleaved(&I)) {
6092         auto Group = getInterleavedAccessGroup(&I);
6093         assert(Group && "Failed to get an interleaved access group.");
6094 
6095         // Make one decision for the whole group.
6096         if (getWideningDecision(&I, VF) != CM_Unknown)
6097           continue;
6098 
6099         NumAccesses = Group->getNumMembers();
6100         if (interleavedAccessCanBeWidened(&I, VF))
6101           InterleaveCost = getInterleaveGroupCost(&I, VF);
6102       }
6103 
6104       unsigned GatherScatterCost =
6105           isLegalGatherOrScatter(&I)
6106               ? getGatherScatterCost(&I, VF) * NumAccesses
6107               : std::numeric_limits<unsigned>::max();
6108 
6109       unsigned ScalarizationCost =
6110           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6111 
6112       // Choose better solution for the current VF,
6113       // write down this decision and use it during vectorization.
6114       unsigned Cost;
6115       InstWidening Decision;
6116       if (InterleaveCost <= GatherScatterCost &&
6117           InterleaveCost < ScalarizationCost) {
6118         Decision = CM_Interleave;
6119         Cost = InterleaveCost;
6120       } else if (GatherScatterCost < ScalarizationCost) {
6121         Decision = CM_GatherScatter;
6122         Cost = GatherScatterCost;
6123       } else {
6124         Decision = CM_Scalarize;
6125         Cost = ScalarizationCost;
6126       }
6127       // If the instruction belongs to an interleave group, the whole group
6128       // receives the same decision. The cost is recorded for the group as a
6129       // whole, but it will actually be assigned to a single instruction.
6130       if (auto Group = getInterleavedAccessGroup(&I))
6131         setWideningDecision(Group, VF, Decision, Cost);
6132       else
6133         setWideningDecision(&I, VF, Decision, Cost);
6134     }
6135   }
6136 
6137   // Make sure that any load of address and any other address computation
6138   // remains scalar unless there is gather/scatter support. This avoids
6139   // inevitable extracts into address registers, and also has the benefit of
6140   // activating LSR more, since that pass can't optimize vectorized
6141   // addresses.
6142   if (TTI.prefersVectorizedAddressing())
6143     return;
6144 
6145   // Start with all scalar pointer uses.
6146   SmallPtrSet<Instruction *, 8> AddrDefs;
6147   for (BasicBlock *BB : TheLoop->blocks())
6148     for (Instruction &I : *BB) {
6149       Instruction *PtrDef =
6150         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6151       if (PtrDef && TheLoop->contains(PtrDef) &&
6152           getWideningDecision(&I, VF) != CM_GatherScatter)
6153         AddrDefs.insert(PtrDef);
6154     }
6155 
6156   // Add all instructions used to generate the addresses.
6157   SmallVector<Instruction *, 4> Worklist;
6158   for (auto *I : AddrDefs)
6159     Worklist.push_back(I);
6160   while (!Worklist.empty()) {
6161     Instruction *I = Worklist.pop_back_val();
6162     for (auto &Op : I->operands())
6163       if (auto *InstOp = dyn_cast<Instruction>(Op))
6164         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6165             AddrDefs.insert(InstOp).second)
6166           Worklist.push_back(InstOp);
6167   }
6168 
6169   for (auto *I : AddrDefs) {
6170     if (isa<LoadInst>(I)) {
6171       // Setting the desired widening decision should ideally be handled by
6172       // cost functions, but since this involves finding out whether the loaded
6173       // register is involved in an address computation, the decision is
6174       // instead changed here, where we know this is the case.
6175       InstWidening Decision = getWideningDecision(I, VF);
6176       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6177         // Scalarize a widened load of address.
6178         setWideningDecision(I, VF, CM_Scalarize,
6179                             (VF * getMemoryInstructionCost(I, 1)));
6180       else if (auto Group = getInterleavedAccessGroup(I)) {
6181         // Scalarize an interleave group of address loads.
6182         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6183           if (Instruction *Member = Group->getMember(I))
6184             setWideningDecision(Member, VF, CM_Scalarize,
6185                                 (VF * getMemoryInstructionCost(Member, 1)));
6186         }
6187       }
6188     } else
6189       // Make sure I gets scalarized and a cost estimate without
6190       // scalarization overhead.
6191       ForcedScalars[VF].insert(I);
6192   }
6193 }
6194 
6195 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6196                                                         unsigned VF,
6197                                                         Type *&VectorTy) {
6198   Type *RetTy = I->getType();
6199   if (canTruncateToMinimalBitwidth(I, VF))
6200     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6201   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6202   auto SE = PSE.getSE();
6203   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6204 
6205   // TODO: We need to estimate the cost of intrinsic calls.
6206   switch (I->getOpcode()) {
6207   case Instruction::GetElementPtr:
6208     // We mark this instruction as zero-cost because the cost of GEPs in
6209     // vectorized code depends on whether the corresponding memory instruction
6210     // is scalarized or not. Therefore, we handle GEPs with the memory
6211     // instruction cost.
6212     return 0;
6213   case Instruction::Br: {
6214     // In cases of scalarized and predicated instructions, there will be VF
6215     // predicated blocks in the vectorized loop. Each branch around these
6216     // blocks also requires an extract of its vector compare i1 element.
6217     bool ScalarPredicatedBB = false;
6218     BranchInst *BI = cast<BranchInst>(I);
6219     if (VF > 1 && BI->isConditional() &&
6220         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6221              PredicatedBBsAfterVectorization.end() ||
6222          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6223              PredicatedBBsAfterVectorization.end()))
6224       ScalarPredicatedBB = true;
6225 
6226     if (ScalarPredicatedBB) {
6227       // Return cost for branches around scalarized and predicated blocks.
6228       VectorType *Vec_i1Ty =
6229           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6230       return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
6231                                            false, true) +
6232               (TTI.getCFInstrCost(Instruction::Br) * VF));
6233     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6234       // The back-edge branch will remain, as will all scalar branches.
6235       return TTI.getCFInstrCost(Instruction::Br);
6236     else
6237       // This branch will be eliminated by if-conversion.
6238       return 0;
6239     // Note: We currently assume zero cost for an unconditional branch inside
6240     // a predicated block since it will become a fall-through, although we
6241     // may decide in the future to call TTI for all branches.
6242   }
6243   case Instruction::PHI: {
6244     auto *Phi = cast<PHINode>(I);
6245 
6246     // First-order recurrences are replaced by vector shuffles inside the loop.
6247     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6248     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6249       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6250                                 cast<VectorType>(VectorTy), VF - 1,
6251                                 VectorType::get(RetTy, 1));
6252 
6253     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6254     // converted into select instructions. We require N - 1 selects per phi
6255     // node, where N is the number of incoming values.
6256     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6257       return (Phi->getNumIncomingValues() - 1) *
6258              TTI.getCmpSelInstrCost(
6259                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6260                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6261                  CostKind);
6262 
6263     return TTI.getCFInstrCost(Instruction::PHI);
6264   }
6265   case Instruction::UDiv:
6266   case Instruction::SDiv:
6267   case Instruction::URem:
6268   case Instruction::SRem:
6269     // If we have a predicated instruction, it may not be executed for each
6270     // vector lane. Get the scalarization cost and scale this amount by the
6271     // probability of executing the predicated block. If the instruction is not
6272     // predicated, we fall through to the next case.
6273     if (VF > 1 && isScalarWithPredication(I)) {
6274       unsigned Cost = 0;
6275 
6276       // These instructions have a non-void type, so account for the phi nodes
6277       // that we will create. This cost is likely to be zero. The phi node
6278       // cost, if any, should be scaled by the block probability because it
6279       // models a copy at the end of each predicated block.
6280       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6281 
6282       // The cost of the non-predicated instruction.
6283       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6284 
6285       // The cost of insertelement and extractelement instructions needed for
6286       // scalarization.
6287       Cost += getScalarizationOverhead(I, VF);
6288 
6289       // Scale the cost by the probability of executing the predicated blocks.
6290       // This assumes the predicated block for each vector lane is equally
6291       // likely.
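      // E.g., if getReciprocalPredBlockProb() is 2, the combined cost above is
      // halved, modeling a predicated block that executes on roughly half of
      // the iterations.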
6292       return Cost / getReciprocalPredBlockProb();
6293     }
6294     LLVM_FALLTHROUGH;
6295   case Instruction::Add:
6296   case Instruction::FAdd:
6297   case Instruction::Sub:
6298   case Instruction::FSub:
6299   case Instruction::Mul:
6300   case Instruction::FMul:
6301   case Instruction::FDiv:
6302   case Instruction::FRem:
6303   case Instruction::Shl:
6304   case Instruction::LShr:
6305   case Instruction::AShr:
6306   case Instruction::And:
6307   case Instruction::Or:
6308   case Instruction::Xor: {
6309     // Since we will replace the stride by 1, the multiplication should go away.
6310     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6311       return 0;
6312     // Certain instructions can be cheaper to vectorize if they have a constant
6313     // second vector operand. One example of this is shifts on x86.
6314     Value *Op2 = I->getOperand(1);
6315     TargetTransformInfo::OperandValueProperties Op2VP;
6316     TargetTransformInfo::OperandValueKind Op2VK =
6317         TTI.getOperandInfo(Op2, Op2VP);
6318     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6319       Op2VK = TargetTransformInfo::OK_UniformValue;
6320 
6321     SmallVector<const Value *, 4> Operands(I->operand_values());
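    // If the instruction remains scalar after vectorization, it is costed as
    // VF scalar copies; otherwise a single instruction on the widened type is
    // costed.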
6322     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6323     return N * TTI.getArithmeticInstrCost(
6324                    I->getOpcode(), VectorTy, CostKind,
6325                    TargetTransformInfo::OK_AnyValue,
6326                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6327   }
6328   case Instruction::FNeg: {
6329     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6330     return N * TTI.getArithmeticInstrCost(
6331                    I->getOpcode(), VectorTy, CostKind,
6332                    TargetTransformInfo::OK_AnyValue,
6333                    TargetTransformInfo::OK_AnyValue,
6334                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6335                    I->getOperand(0), I);
6336   }
6337   case Instruction::Select: {
6338     SelectInst *SI = cast<SelectInst>(I);
6339     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6340     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6341     Type *CondTy = SI->getCondition()->getType();
6342     if (!ScalarCond)
6343       CondTy = VectorType::get(CondTy, VF);
6344 
6345     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6346                                   CostKind, I);
6347   }
6348   case Instruction::ICmp:
6349   case Instruction::FCmp: {
6350     Type *ValTy = I->getOperand(0)->getType();
6351     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6352     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6353       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6354     VectorTy = ToVectorTy(ValTy, VF);
6355     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6356                                   I);
6357   }
6358   case Instruction::Store:
6359   case Instruction::Load: {
6360     unsigned Width = VF;
6361     if (Width > 1) {
6362       InstWidening Decision = getWideningDecision(I, Width);
6363       assert(Decision != CM_Unknown &&
6364              "CM decision should be taken at this point");
6365       if (Decision == CM_Scalarize)
6366         Width = 1;
6367     }
6368     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6369     return getMemoryInstructionCost(I, VF);
6370   }
6371   case Instruction::ZExt:
6372   case Instruction::SExt:
6373   case Instruction::FPToUI:
6374   case Instruction::FPToSI:
6375   case Instruction::FPExt:
6376   case Instruction::PtrToInt:
6377   case Instruction::IntToPtr:
6378   case Instruction::SIToFP:
6379   case Instruction::UIToFP:
6380   case Instruction::Trunc:
6381   case Instruction::FPTrunc:
6382   case Instruction::BitCast: {
6383     // We optimize the truncation of induction variables having constant
6384     // integer steps. The cost of these truncations is the same as the scalar
6385     // operation.
6386     if (isOptimizableIVTruncate(I, VF)) {
6387       auto *Trunc = cast<TruncInst>(I);
6388       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6389                                   Trunc->getSrcTy(), CostKind, Trunc);
6390     }
6391 
6392     Type *SrcScalarTy = I->getOperand(0)->getType();
6393     Type *SrcVecTy =
6394         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6395     if (canTruncateToMinimalBitwidth(I, VF)) {
6396       // This cast is going to be shrunk. This may remove the cast or it might
6397       // turn it into a slightly different cast. For example, if MinBW == 16,
6398       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6399       //
6400       // Calculate the modified src and dest types.
6401       Type *MinVecTy = VectorTy;
6402       if (I->getOpcode() == Instruction::Trunc) {
6403         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6404         VectorTy =
6405             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6406       } else if (I->getOpcode() == Instruction::ZExt ||
6407                  I->getOpcode() == Instruction::SExt) {
6408         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6409         VectorTy =
6410             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6411       }
6412     }
6413 
6414     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6415     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
6416                                     CostKind, I);
6417   }
6418   case Instruction::Call: {
6419     bool NeedToScalarize;
6420     CallInst *CI = cast<CallInst>(I);
6421     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6422     if (getVectorIntrinsicIDForCall(CI, TLI))
6423       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6424     return CallCost;
6425   }
6426   default:
6427     // The cost of executing VF copies of the scalar instruction. This opcode
6428     // is unknown. Assume that it is the same as 'mul'.
6429     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
6430                                            CostKind) +
6431            getScalarizationOverhead(I, VF);
6432   } // end of switch.
6433 }
6434 
6435 char LoopVectorize::ID = 0;
6436 
6437 static const char lv_name[] = "Loop Vectorization";
6438 
6439 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6440 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6441 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6442 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6443 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6444 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6445 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6446 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6447 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6448 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6449 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6450 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6451 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6452 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6453 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6454 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6455 
6456 namespace llvm {
6457 
6458 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6459 
6460 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6461                               bool VectorizeOnlyWhenForced) {
6462   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6463 }
6464 
6465 } // end namespace llvm
6466 
6467 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6468   // Check if the pointer operand of a load or store instruction is
6469   // consecutive.
6470   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6471     return Legal->isConsecutivePtr(Ptr);
6472   return false;
6473 }
6474 
6475 void LoopVectorizationCostModel::collectValuesToIgnore() {
6476   // Ignore ephemeral values.
6477   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6478 
6479   // Ignore type-promoting instructions we identified during reduction
6480   // detection.
6481   for (auto &Reduction : Legal->getReductionVars()) {
6482     RecurrenceDescriptor &RedDes = Reduction.second;
6483     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6484     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6485   }
6486   // Ignore type-casting instructions we identified during induction
6487   // detection.
6488   for (auto &Induction : Legal->getInductionVars()) {
6489     InductionDescriptor &IndDes = Induction.second;
6490     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6491     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6492   }
6493 }
6494 
6495 // TODO: we could return a pair of values that specify the max VF and
6496 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6497 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6498 // doesn't have a cost model that can choose which plan to execute if
6499 // more than one is generated.
6500 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6501                                  LoopVectorizationCostModel &CM) {
6502   unsigned WidestType;
6503   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
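  // E.g., 512-bit wide vector registers and a widest loop type of i32 give a
  // VF of 512 / 32 = 16.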
6504   return WidestVectorRegBits / WidestType;
6505 }
6506 
6507 VectorizationFactor
6508 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6509   unsigned VF = UserVF;
6510   // Outer loop handling: outer loops may require CFG and instruction-level
6511   // transformations before we can even evaluate whether vectorization is
6512   // profitable. Since we cannot modify the incoming IR, we need to build
6513   // VPlan upfront in the vectorization pipeline.
6514   if (!OrigLoop->empty()) {
6515     // If the user doesn't provide a vectorization factor, determine a
6516     // reasonable one.
6517     if (!UserVF) {
6518       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector */), CM);
6519       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6520 
6521       // Make sure we have a VF > 1 for stress testing.
6522       if (VPlanBuildStressTest && VF < 2) {
6523         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6524                           << "overriding computed VF.\n");
6525         VF = 4;
6526       }
6527     }
6528     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6529     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6530     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6531                       << " to build VPlans.\n");
6532     buildVPlans(VF, VF);
6533 
6534     // For VPlan build stress testing, we bail out after VPlan construction.
6535     if (VPlanBuildStressTest)
6536       return VectorizationFactor::Disabled();
6537 
6538     return {VF, 0};
6539   }
6540 
6541   LLVM_DEBUG(
6542       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6543                 "VPlan-native path.\n");
6544   return VectorizationFactor::Disabled();
6545 }
6546 
6547 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6548   assert(OrigLoop->empty() && "Inner loop expected.");
6549   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6550   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6551     return None;
6552 
6553   // Invalidate interleave groups if all blocks of the loop will be predicated.
6554   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6555       !useMaskedInterleavedAccesses(*TTI)) {
6556     LLVM_DEBUG(
6557         dbgs()
6558         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6559            "which requires masked-interleaved support.\n");
6560     if (CM.InterleaveInfo.invalidateGroups())
6561       // Invalidating interleave groups also requires invalidating all decisions
6562       // based on them, which includes widening decisions and uniform and scalar
6563       // values.
6564       CM.invalidateCostModelingDecisions();
6565   }
6566 
6567   if (UserVF) {
6568     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6569     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6570     // Collect the instructions (and their associated costs) that will be more
6571     // profitable to scalarize.
6572     CM.selectUserVectorizationFactor(UserVF);
6573     buildVPlansWithVPRecipes(UserVF, UserVF);
6574     LLVM_DEBUG(printPlans(dbgs()));
6575     return {{UserVF, 0}};
6576   }
6577 
6578   unsigned MaxVF = MaybeMaxVF.getValue();
6579   assert(MaxVF != 0 && "MaxVF is zero.");
6580 
6581   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6582     // Collect Uniform and Scalar instructions after vectorization with VF.
6583     CM.collectUniformsAndScalars(VF);
6584 
6585     // Collect the instructions (and their associated costs) that will be more
6586     // profitable to scalarize.
6587     if (VF > 1)
6588       CM.collectInstsToScalarize(VF);
6589   }
6590 
6591   buildVPlansWithVPRecipes(1, MaxVF);
6592   LLVM_DEBUG(printPlans(dbgs()));
6593   if (MaxVF == 1)
6594     return VectorizationFactor::Disabled();
6595 
6596   // Select the optimal vectorization factor.
6597   return CM.selectVectorizationFactor(MaxVF);
6598 }
6599 
6600 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6601   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6602                     << '\n');
6603   BestVF = VF;
6604   BestUF = UF;
6605 
6606   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6607     return !Plan->hasVF(VF);
6608   });
6609   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6610 }
6611 
6612 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6613                                            DominatorTree *DT) {
6614   // Perform the actual loop transformation.
6615 
6616   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6617   VPCallbackILV CallbackILV(ILV);
6618 
6619   VPTransformState State{BestVF, BestUF,      LI,
6620                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6621                          &ILV,   CallbackILV};
6622   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6623   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6624   State.CanonicalIV = ILV.Induction;
6625 
6626   //===------------------------------------------------===//
6627   //
6628   // Notice: any optimization or new instruction that goes
6629   // into the code below should also be implemented in
6630   // the cost-model.
6631   //
6632   //===------------------------------------------------===//
6633 
6634   // 2. Copy and widen instructions from the old loop into the new loop.
6635   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6636   VPlans.front()->execute(&State);
6637 
6638   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6639   //    predication, updating analyses.
6640   ILV.fixVectorizedLoop();
6641 }
6642 
6643 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6644     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6645   BasicBlock *Latch = OrigLoop->getLoopLatch();
6646 
6647   // We create new control-flow for the vectorized loop, so the original
6648   // condition will be dead after vectorization if it's only used by the
6649   // branch.
6650   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6651   if (Cmp && Cmp->hasOneUse())
6652     DeadInstructions.insert(Cmp);
6653 
6654   // We create new "steps" for induction variable updates to which the original
6655   // induction variables map. An original update instruction will be dead if
6656   // all its users except the induction variable are dead.
6657   for (auto &Induction : Legal->getInductionVars()) {
6658     PHINode *Ind = Induction.first;
6659     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6660     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6661           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6662                                  DeadInstructions.end();
6663         }))
6664       DeadInstructions.insert(IndUpdate);
6665 
6666     // We also record as "Dead" the type-casting instructions we identified
6667     // during induction analysis. We don't need any handling for them in the
6668     // vectorized loop because we have proven that, under a proper runtime
6669     // test guarding the vectorized loop, the value of the phi, and the casted
6670     // value of the phi, are the same. The last instruction in this casting chain
6671     // will get its scalar/vector/widened def from the scalar/vector/widened def
6672     // of the respective phi node. Any other casts in the induction def-use chain
6673     // have no other uses outside the phi update chain, and will be ignored.
6674     InductionDescriptor &IndDes = Induction.second;
6675     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6676     DeadInstructions.insert(Casts.begin(), Casts.end());
6677   }
6678 }
6679 
6680 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6681 
6682 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6683 
6684 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6685                                         Instruction::BinaryOps BinOp) {
6686   // When unrolling and the VF is 1, we only need to add a simple scalar.
6687   Type *Ty = Val->getType();
6688   assert(!Ty->isVectorTy() && "Val must be a scalar");
6689 
6690   if (Ty->isFloatingPointTy()) {
6691     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6692 
6693     // Floating point operations had to be 'fast' to enable the unrolling.
6694     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6695     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6696   }
6697   Constant *C = ConstantInt::get(Ty, StartIdx);
6698   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6699 }
6700 
6701 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6702   SmallVector<Metadata *, 4> MDs;
6703   // Reserve first location for self reference to the LoopID metadata node.
6704   MDs.push_back(nullptr);
6705   bool IsUnrollMetadata = false;
6706   MDNode *LoopID = L->getLoopID();
6707   if (LoopID) {
6708     // First find existing loop unrolling disable metadata.
6709     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6710       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6711       if (MD) {
6712         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6713         IsUnrollMetadata =
6714             S && S->getString().startswith("llvm.loop.unroll.disable");
6715       }
6716       MDs.push_back(LoopID->getOperand(i));
6717     }
6718   }
6719 
6720   if (!IsUnrollMetadata) {
6721     // Add runtime unroll disable metadata.
6722     LLVMContext &Context = L->getHeader()->getContext();
6723     SmallVector<Metadata *, 1> DisableOperands;
6724     DisableOperands.push_back(
6725         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6726     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6727     MDs.push_back(DisableNode);
6728     MDNode *NewLoopID = MDNode::get(Context, MDs);
6729     // Set operand 0 to refer to the loop id itself.
6730     NewLoopID->replaceOperandWith(0, NewLoopID);
6731     L->setLoopID(NewLoopID);
6732   }
6733 }
6734 
6735 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6736     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6737   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6738   bool PredicateAtRangeStart = Predicate(Range.Start);
6739 
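  // Clamp the range at the first VF whose decision differs from the one taken
  // at Range.Start, so that all VFs kept in the (sub-)range share the same
  // decision; the caller can then start a new range (and VPlan) at Range.End.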
6740   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6741     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6742       Range.End = TmpVF;
6743       break;
6744     }
6745 
6746   return PredicateAtRangeStart;
6747 }
6748 
6749 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6750 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6751 /// of VF's starting at a given VF and extending it as much as possible. Each
6752 /// vectorization decision can potentially shorten this sub-range during
6753 /// buildVPlan().
6754 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6755   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6756     VFRange SubRange = {VF, MaxVF + 1};
6757     VPlans.push_back(buildVPlan(SubRange));
6758     VF = SubRange.End;
6759   }
6760 }
6761 
6762 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6763                                          VPlanPtr &Plan) {
6764   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6765 
6766   // Look for cached value.
6767   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6768   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6769   if (ECEntryIt != EdgeMaskCache.end())
6770     return ECEntryIt->second;
6771 
6772   VPValue *SrcMask = createBlockInMask(Src, Plan);
6773 
6774   // The terminator has to be a branch inst!
6775   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6776   assert(BI && "Unexpected terminator found");
6777 
6778   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6779     return EdgeMaskCache[Edge] = SrcMask;
6780 
6781   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6782   assert(EdgeMask && "No Edge Mask found for condition");
6783 
6784   if (BI->getSuccessor(0) != Dst)
6785     EdgeMask = Builder.createNot(EdgeMask);
6786 
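  // A null SrcMask means the source block is unconditionally executed, so the
  // edge mask is just the (possibly negated) branch condition.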
6787   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6788     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6789 
6790   return EdgeMaskCache[Edge] = EdgeMask;
6791 }
6792 
6793 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6794   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6795 
6796   // Look for cached value.
6797   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6798   if (BCEntryIt != BlockMaskCache.end())
6799     return BCEntryIt->second;
6800 
6801   // All-one mask is modelled as no-mask following the convention for masked
6802   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6803   VPValue *BlockMask = nullptr;
6804 
6805   if (OrigLoop->getHeader() == BB) {
6806     if (!CM.blockNeedsPredication(BB))
6807       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6808 
6809     // Introduce the early-exit compare IV <= BTC to form header block mask.
6810     // This is used instead of IV < TC because TC may wrap, unlike BTC.
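    // (E.g., with an i8 induction a trip count of 256 wraps to 0, whereas the
    // backedge-taken count of 255 is still representable.)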
6811     // Start by constructing the desired canonical IV.
6812     VPValue *IV = nullptr;
6813     if (Legal->getPrimaryInduction())
6814       IV = Plan->getVPValue(Legal->getPrimaryInduction());
6815     else {
6816       auto IVRecipe = new VPWidenCanonicalIVRecipe();
6817       Builder.getInsertBlock()->appendRecipe(IVRecipe);
6818       IV = IVRecipe->getVPValue();
6819     }
6820     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6821     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6822     return BlockMaskCache[BB] = BlockMask;
6823   }
6824 
6825   // This is the block mask. We OR all incoming edges.
6826   for (auto *Predecessor : predecessors(BB)) {
6827     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6828     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6829       return BlockMaskCache[BB] = EdgeMask;
6830 
6831     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6832       BlockMask = EdgeMask;
6833       continue;
6834     }
6835 
6836     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6837   }
6838 
6839   return BlockMaskCache[BB] = BlockMask;
6840 }
6841 
6842 VPWidenMemoryInstructionRecipe *
6843 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6844                                   VPlanPtr &Plan) {
6845   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6846          "Must be called with either a load or store");
6847 
6848   auto willWiden = [&](unsigned VF) -> bool {
6849     if (VF == 1)
6850       return false;
6851     LoopVectorizationCostModel::InstWidening Decision =
6852         CM.getWideningDecision(I, VF);
6853     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6854            "CM decision should be taken at this point.");
6855     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6856       return true;
6857     if (CM.isScalarAfterVectorization(I, VF) ||
6858         CM.isProfitableToScalarize(I, VF))
6859       return false;
6860     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6861   };
6862 
6863   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6864     return nullptr;
6865 
6866   VPValue *Mask = nullptr;
6867   if (Legal->isMaskRequired(I))
6868     Mask = createBlockInMask(I->getParent(), Plan);
6869 
6870   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6871   if (LoadInst *Load = dyn_cast<LoadInst>(I))
6872     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
6873 
6874   StoreInst *Store = cast<StoreInst>(I);
6875   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
6876   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
6877 }
6878 
6879 VPWidenIntOrFpInductionRecipe *
6880 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
6881   // Check if this is an integer or fp induction. If so, build the recipe that
6882   // produces its scalar and vector values.
6883   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6884   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6885       II.getKind() == InductionDescriptor::IK_FpInduction)
6886     return new VPWidenIntOrFpInductionRecipe(Phi);
6887 
6888   return nullptr;
6889 }
6890 
6891 VPWidenIntOrFpInductionRecipe *
6892 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
6893                                                 VFRange &Range) const {
6894   // Optimize the special case where the source is a constant integer
6895   // induction variable. Notice that we can only optimize the 'trunc' case
6896   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6897   // (c) other casts depend on pointer size.
6898 
6899   // Determine whether \p K is a truncation based on an induction variable that
6900   // can be optimized.
6901   auto isOptimizableIVTruncate =
6902       [&](Instruction *K) -> std::function<bool(unsigned)> {
6903     return
6904         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6905   };
6906 
6907   if (LoopVectorizationPlanner::getDecisionAndClampRange(
6908           isOptimizableIVTruncate(I), Range))
6909     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6910                                              I);
6911   return nullptr;
6912 }
6913 
6914 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
6915   // We know that all PHIs in non-header blocks are converted into selects, so
6916   // we don't have to worry about the insertion order and we can just use the
6917   // builder. At this point we generate the predication tree. There may be
6918   // duplications since this is a simple recursive scan, but future
6919   // optimizations will clean it up.
6920 
6921   SmallVector<VPValue *, 2> Operands;
6922   unsigned NumIncoming = Phi->getNumIncomingValues();
6923   for (unsigned In = 0; In < NumIncoming; In++) {
6924     VPValue *EdgeMask =
6925       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6926     assert((EdgeMask || NumIncoming == 1) &&
6927            "Multiple predecessors with one having a full mask");
6928     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
6929     if (EdgeMask)
6930       Operands.push_back(EdgeMask);
6931   }
6932   return new VPBlendRecipe(Phi, Operands);
6933 }
6934 
6935 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
6936                                                    VPlan &Plan) const {
6937 
6938   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6939       [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); },
6940       Range);
6941 
6942   if (IsPredicated)
6943     return nullptr;
6944 
6945   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6946   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6947              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6948     return nullptr;
6949 
6950   auto willWiden = [&](unsigned VF) -> bool {
6951     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6952     // The following case may be scalarized depending on the VF.
6953     // The flag shows whether we use an intrinsic or a usual call for the
6954     // vectorized version of the instruction.
6955     // Is it beneficial to perform an intrinsic call compared to a lib call?
6956     bool NeedToScalarize = false;
6957     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6958     bool UseVectorIntrinsic =
6959         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6960     return UseVectorIntrinsic || !NeedToScalarize;
6961   };
6962 
6963   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6964     return nullptr;
6965 
6966   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
6967 }
6968 
6969 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
6970   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
6971          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
6972   // The instruction should be widened, unless it is scalar after
6973   // vectorization, scalarization is profitable, or it is predicated.
6974   auto WillScalarize = [this, I](unsigned VF) -> bool {
6975     return CM.isScalarAfterVectorization(I, VF) ||
6976            CM.isProfitableToScalarize(I, VF) ||
6977            CM.isScalarWithPredication(I, VF);
6978   };
6979   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
6980                                                              Range);
6981 }
6982 
6983 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
6984   auto IsVectorizableOpcode = [](unsigned Opcode) {
6985     switch (Opcode) {
6986     case Instruction::Add:
6987     case Instruction::And:
6988     case Instruction::AShr:
6989     case Instruction::BitCast:
6990     case Instruction::FAdd:
6991     case Instruction::FCmp:
6992     case Instruction::FDiv:
6993     case Instruction::FMul:
6994     case Instruction::FNeg:
6995     case Instruction::FPExt:
6996     case Instruction::FPToSI:
6997     case Instruction::FPToUI:
6998     case Instruction::FPTrunc:
6999     case Instruction::FRem:
7000     case Instruction::FSub:
7001     case Instruction::ICmp:
7002     case Instruction::IntToPtr:
7003     case Instruction::LShr:
7004     case Instruction::Mul:
7005     case Instruction::Or:
7006     case Instruction::PtrToInt:
7007     case Instruction::SDiv:
7008     case Instruction::Select:
7009     case Instruction::SExt:
7010     case Instruction::Shl:
7011     case Instruction::SIToFP:
7012     case Instruction::SRem:
7013     case Instruction::Sub:
7014     case Instruction::Trunc:
7015     case Instruction::UDiv:
7016     case Instruction::UIToFP:
7017     case Instruction::URem:
7018     case Instruction::Xor:
7019     case Instruction::ZExt:
7020       return true;
7021     }
7022     return false;
7023   };
7024 
7025   if (!IsVectorizableOpcode(I->getOpcode()))
7026     return nullptr;
7027 
7028   // Success: widen this instruction.
7029   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7030 }
7031 
7032 VPBasicBlock *VPRecipeBuilder::handleReplication(
7033     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7034     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7035     VPlanPtr &Plan) {
7036   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7037       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7038       Range);
7039 
7040   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7041       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7042 
7043   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
7044   setRecipe(I, Recipe);
7045 
7046   // Find if I uses a predicated instruction. If so, it will use its scalar
7047   // value. Avoid hoisting the insert-element which packs the scalar value into
7048   // a vector value, as that happens iff all users use the vector value.
7049   for (auto &Op : I->operands())
7050     if (auto *PredInst = dyn_cast<Instruction>(Op))
7051       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7052         PredInst2Recipe[PredInst]->setAlsoPack(false);
7053 
7054   // Finalize the recipe for Instr, first if it is not predicated.
7055   if (!IsPredicated) {
7056     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7057     VPBB->appendRecipe(Recipe);
7058     return VPBB;
7059   }
7060   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7061   assert(VPBB->getSuccessors().empty() &&
7062          "VPBB has successors when handling predicated replication.");
7063   // Record predicated instructions for above packing optimizations.
7064   PredInst2Recipe[I] = Recipe;
7065   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7066   VPBlockUtils::insertBlockAfter(Region, VPBB);
7067   auto *RegSucc = new VPBasicBlock();
7068   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7069   return RegSucc;
7070 }
7071 
7072 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7073                                                       VPRecipeBase *PredRecipe,
7074                                                       VPlanPtr &Plan) {
7075   // Instructions marked for predication are replicated and placed under an
7076   // if-then construct to prevent side-effects.
7077 
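  // The region built below is a triangle: <name>.entry branches on the mask
  // either to <name>.if (which holds the replicated instruction) or directly
  // to <name>.continue, and <name>.if falls through to <name>.continue, where
  // an optional phi merges the produced value.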
7078   // Generate recipes to compute the block mask for this region.
7079   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7080 
7081   // Build the triangular if-then region.
7082   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7083   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7084   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7085   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7086   auto *PHIRecipe =
7087       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7088   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7089   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7090   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7091 
7092   // Note: first set Entry as region entry and then connect successors starting
7093   // from it in order, to propagate the "parent" of each VPBasicBlock.
7094   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7095   VPBlockUtils::connectBlocks(Pred, Exit);
7096 
7097   return Region;
7098 }
7099 
7100 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7101                                                       VFRange &Range,
7102                                                       VPlanPtr &Plan) {
7103   // First, check for specific widening recipes that deal with calls, memory
7104   // operations, inductions and Phi nodes.
7105   if (auto *CI = dyn_cast<CallInst>(Instr))
7106     return tryToWidenCall(CI, Range, *Plan);
7107 
7108   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7109     return tryToWidenMemory(Instr, Range, Plan);
7110 
7111   VPRecipeBase *Recipe;
7112   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7113     if (Phi->getParent() != OrigLoop->getHeader())
7114       return tryToBlend(Phi, Plan);
7115     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7116       return Recipe;
7117     return new VPWidenPHIRecipe(Phi);
7119   }
7120 
7121   if (isa<TruncInst>(Instr) &&
7122       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7123     return Recipe;
7124 
7125   if (!shouldWiden(Instr, Range))
7126     return nullptr;
7127 
7128   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7129     return new VPWidenGEPRecipe(GEP, OrigLoop);
7130 
7131   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7132     bool InvariantCond =
7133         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7134     return new VPWidenSelectRecipe(*SI, InvariantCond);
7135   }
7136 
7137   return tryToWiden(Instr, *Plan);
7138 }
7139 
7140 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7141                                                         unsigned MaxVF) {
7142   assert(OrigLoop->empty() && "Inner loop expected.");
7143 
7144   // Collect conditions feeding internal conditional branches; they need to be
7145   // represented in VPlan for it to model masking.
7146   SmallPtrSet<Value *, 1> NeedDef;
7147 
7148   auto *Latch = OrigLoop->getLoopLatch();
7149   for (BasicBlock *BB : OrigLoop->blocks()) {
7150     if (BB == Latch)
7151       continue;
7152     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7153     if (Branch && Branch->isConditional())
7154       NeedDef.insert(Branch->getCondition());
7155   }
7156 
7157   // If the tail is to be folded by masking, the primary induction variable, if
7158   // it exists, needs to be represented in VPlan to model early-exit masking.
7159   // Also, both the Phi and the live-out instruction of each reduction are
7160   // required in order to introduce a select between them in VPlan.
7161   if (CM.foldTailByMasking()) {
7162     if (Legal->getPrimaryInduction())
7163       NeedDef.insert(Legal->getPrimaryInduction());
7164     for (auto &Reduction : Legal->getReductionVars()) {
7165       NeedDef.insert(Reduction.first);
7166       NeedDef.insert(Reduction.second.getLoopExitInstr());
7167     }
7168   }
7169 
7170   // Collect instructions from the original loop that will become trivially dead
7171   // in the vectorized loop. We don't need to vectorize these instructions. For
7172   // example, original induction update instructions can become dead because we
7173   // separately emit induction "steps" when generating code for the new loop.
7174   // Similarly, we create a new latch condition when setting up the structure
7175   // of the new loop, so the old one can become dead.
7176   SmallPtrSet<Instruction *, 4> DeadInstructions;
7177   collectTriviallyDeadInstructions(DeadInstructions);
7178 
7179   // Add assume instructions we need to drop to DeadInstructions, to prevent
7180   // them from being added to the VPlan.
7181   // TODO: We only need to drop assumes in blocks that get flattened. If the
7182   // control flow is preserved, we should keep them.
7183   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7184   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7185 
7186   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7187   // Dead instructions do not need sinking. Remove them from SinkAfter.
7188   for (Instruction *I : DeadInstructions)
7189     SinkAfter.erase(I);
7190 
7191   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7192     VFRange SubRange = {VF, MaxVF + 1};
7193     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7194                                              DeadInstructions, SinkAfter));
7195     VF = SubRange.End;
7196   }
7197 }
7198 
7199 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7200     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7201     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7202     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7203 
7204   // Hold a mapping from predicated instructions to their recipes, in order to
7205   // fix their AlsoPack behavior if a user is determined to replicate and use a
7206   // scalar instead of a vector value.
7207   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7208 
7209   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7210 
7211   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7212 
7213   // ---------------------------------------------------------------------------
7214   // Pre-construction: record ingredients whose recipes we'll need to further
7215   // process after constructing the initial VPlan.
7216   // ---------------------------------------------------------------------------
7217 
7218   // Mark instructions we'll need to sink later and their targets as
7219   // ingredients whose recipe we'll need to record.
7220   for (auto &Entry : SinkAfter) {
7221     RecipeBuilder.recordRecipeOf(Entry.first);
7222     RecipeBuilder.recordRecipeOf(Entry.second);
7223   }
7224 
7225   // For each interleave group which is relevant for this (possibly trimmed)
7226   // Range, add it to the set of groups to be later applied to the VPlan and add
7227   // placeholders for its members' Recipes which we'll be replacing with a
7228   // single VPInterleaveRecipe.
7229   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7230     auto applyIG = [IG, this](unsigned VF) -> bool {
7231       return (VF >= 2 && // Query is illegal for VF == 1
7232               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7233                   LoopVectorizationCostModel::CM_Interleave);
7234     };
7235     if (!getDecisionAndClampRange(applyIG, Range))
7236       continue;
7237     InterleaveGroups.insert(IG);
7238     for (unsigned i = 0; i < IG->getFactor(); i++)
7239       if (Instruction *Member = IG->getMember(i))
7240         RecipeBuilder.recordRecipeOf(Member);
7241   }
7242 
7243   // ---------------------------------------------------------------------------
7244   // Build initial VPlan: Scan the body of the loop in a topological order to
7245   // visit each basic block after having visited its predecessor basic blocks.
7246   // ---------------------------------------------------------------------------
7247 
7248   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
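  // The dummy block only serves as an anchor for insertBlockAfter while the
  // plan body is built; it is disconnected and deleted once all blocks have
  // been created (see the clean-up after the loop below).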
7249   auto Plan = std::make_unique<VPlan>();
7250   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7251   Plan->setEntry(VPBB);
7252 
7253   // Represent values that will have defs inside VPlan.
7254   for (Value *V : NeedDef)
7255     Plan->addVPValue(V);
7256 
7257   // Scan the body of the loop in a topological order to visit each basic block
7258   // after having visited its predecessor basic blocks.
7259   LoopBlocksDFS DFS(OrigLoop);
7260   DFS.perform(LI);
7261 
7262   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7263     // Relevant instructions from basic block BB will be grouped into VPRecipe
7264     // ingredients and fill a new VPBasicBlock.
7265     unsigned VPBBsForBB = 0;
7266     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7267     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7268     VPBB = FirstVPBBForBB;
7269     Builder.setInsertPoint(VPBB);
7270 
7271     // Introduce each ingredient into VPlan.
7272     // TODO: Model and preserve debug intrinsics in VPlan.
7273     for (Instruction &I : BB->instructionsWithoutDebug()) {
7274       Instruction *Instr = &I;
7275 
7276       // First filter out irrelevant instructions, to ensure no recipes are
7277       // built for them.
7278       if (isa<BranchInst>(Instr) ||
7279           DeadInstructions.find(Instr) != DeadInstructions.end())
7280         continue;
7281 
7282       if (auto Recipe =
7283               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7284         RecipeBuilder.setRecipe(Instr, Recipe);
7285         VPBB->appendRecipe(Recipe);
7286         continue;
7287       }
7288 
7289       // Otherwise, if all widening options failed, Instruction is to be
7290       // replicated. This may create a successor for VPBB.
7291       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7292           Instr, Range, VPBB, PredInst2Recipe, Plan);
7293       if (NextVPBB != VPBB) {
7294         VPBB = NextVPBB;
7295         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7296                                     : "");
7297       }
7298     }
7299   }
7300 
7301   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7302   // may also be empty, such as the last one (VPBB), reflecting original
7303   // basic blocks with no recipes.
7304   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7305   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7306   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7307   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7308   delete PreEntry;
7309 
7310   // ---------------------------------------------------------------------------
7311   // Transform initial VPlan: Apply previously taken decisions, in order, to
7312   // bring the VPlan to its final state.
7313   // ---------------------------------------------------------------------------
7314 
7315   // Apply Sink-After legal constraints.
7316   for (auto &Entry : SinkAfter) {
7317     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7318     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7319     Sink->moveAfter(Target);
7320   }
7321 
7322   // Interleave memory: for each Interleave Group we marked earlier as relevant
7323   // for this VPlan, replace the Recipes widening its memory instructions with a
7324   // single VPInterleaveRecipe at its insertion point.
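  // For example, two strided loads A[2*i] and A[2*i+1] forming one group are
  // replaced here by a single VPInterleaveRecipe, which later emits one wide
  // load followed by shuffles that extract each member.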
7325   for (auto IG : InterleaveGroups) {
7326     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7327         RecipeBuilder.getRecipe(IG->getInsertPos()));
7328     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7329         ->insertBefore(Recipe);
7330 
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.getRecipe(Member)->eraseFromParent();
7335   }
7336 
7337   // Finally, if tail is folded by masking, introduce selects between the phi
7338   // and the live-out instruction of each reduction, at the end of the latch.
7339   if (CM.foldTailByMasking()) {
7340     Builder.setInsertPoint(VPBB);
7341     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
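    // Cond is the header's block-in mask, i.e. the predicate of the lanes that
    // are active when the tail is folded by masking.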
7342     for (auto &Reduction : Legal->getReductionVars()) {
7343       VPValue *Phi = Plan->getVPValue(Reduction.first);
7344       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7345       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7346     }
7347   }
7348 
7349   std::string PlanName;
7350   raw_string_ostream RSO(PlanName);
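  // The plan is named after the VF candidates it covers; e.g. a Range of
  // [4, 16) yields "Initial VPlan for VF={4,8},UF>=1".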
7351   unsigned VF = Range.Start;
7352   Plan->addVF(VF);
7353   RSO << "Initial VPlan for VF={" << VF;
7354   for (VF *= 2; VF < Range.End; VF *= 2) {
7355     Plan->addVF(VF);
7356     RSO << "," << VF;
7357   }
7358   RSO << "},UF>=1";
7359   RSO.flush();
7360   Plan->setName(PlanName);
7361 
7362   return Plan;
7363 }
7364 
7365 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build the
  // VPlan upfront in the vectorization pipeline.
  assert(!OrigLoop->empty() && "Expected outer loop.");
7371   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7372 
7373   // Create new empty VPlan
7374   auto Plan = std::make_unique<VPlan>();
7375 
7376   // Build hierarchical CFG
7377   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7378   HCFGBuilder.buildHierarchicalCFG();
7379 
7380   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7381     Plan->addVF(VF);
7382 
7383   if (EnableVPlanPredication) {
7384     VPlanPredicator VPP(*Plan);
7385     VPP.predicate();
7386 
7387     // Avoid running transformation to recipes until masked code generation in
7388     // VPlan-native path is in place.
7389     return Plan;
7390   }
7391 
7392   SmallPtrSet<Instruction *, 1> DeadInstructions;
7393   VPlanTransforms::VPInstructionsToVPRecipes(
7394       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7395   return Plan;
7396 }
7397 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7402 
7403 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7404     Value *V, const VPIteration &Instance) {
7405   return ILV.getOrCreateScalarValue(V, Instance);
7406 }
7407 
7408 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7409                                VPSlotTracker &SlotTracker) const {
7410   O << " +\n"
7411     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7412   IG->getInsertPos()->printAsOperand(O, false);
7413   O << ", ";
7414   getAddr()->printAsOperand(O, SlotTracker);
7415   VPValue *Mask = getMask();
7416   if (Mask) {
7417     O << ", ";
7418     Mask->printAsOperand(O, SlotTracker);
7419   }
7420   O << "\\l\"";
7421   for (unsigned i = 0; i < IG->getFactor(); ++i)
7422     if (Instruction *I = IG->getMember(i))
7423       O << " +\n"
7424         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7425 }
7426 
7427 void VPWidenCallRecipe::execute(VPTransformState &State) {
7428   State.ILV->widenCallInstruction(Ingredient, User, State);
7429 }
7430 
7431 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7432   State.ILV->widenSelectInstruction(Ingredient, InvariantCond);
7433 }
7434 
7435 void VPWidenRecipe::execute(VPTransformState &State) {
7436   State.ILV->widenInstruction(Ingredient, User, State);
7437 }
7438 
7439 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7440   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7441                       IsIndexLoopInvariant);
7442 }
7443 
7444 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7445   assert(!State.Instance && "Int or FP induction being replicated.");
7446   State.ILV->widenIntOrFpInduction(IV, Trunc);
7447 }
7448 
7449 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7450   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7451 }
7452 
7453 void VPBlendRecipe::execute(VPTransformState &State) {
7454   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7455   // We know that all PHIs in non-header blocks are converted into
7456   // selects, so we don't have to worry about the insertion order and we
7457   // can just use the builder.
7458   // At this point we generate the predication tree. There may be
7459   // duplications since this is a simple recursive scan, but future
7460   // optimizations will clean it up.
7461 
7462   unsigned NumIncoming = getNumIncomingValues();
7463 
7464   // Generate a sequence of selects of the form:
7465   // SELECT(Mask3, In3,
7466   //        SELECT(Mask2, In2,
7467   //               SELECT(Mask1, In1,
7468   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, take their value from In0.
7471   InnerLoopVectorizer::VectorParts Entry(State.UF);
7472   for (unsigned In = 0; In < NumIncoming; ++In) {
7473     for (unsigned Part = 0; Part < State.UF; ++Part) {
7474       // We might have single edge PHIs (blocks) - use an identity
7475       // 'select' for the first PHI operand.
7476       Value *In0 = State.get(getIncomingValue(In), Part);
7477       if (In == 0)
7478         Entry[Part] = In0; // Initialize with the first incoming value.
7479       else {
7480         // Select between the current value and the previous incoming edge
7481         // based on the incoming mask.
7482         Value *Cond = State.get(getMask(In), Part);
7483         Entry[Part] =
7484             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7485       }
7486     }
7487   }
7488   for (unsigned Part = 0; Part < State.UF; ++Part)
7489     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7490 }
7491 
7492 void VPInterleaveRecipe::execute(VPTransformState &State) {
7493   assert(!State.Instance && "Interleave group being replicated.");
7494   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7495 }
7496 
7497 void VPReplicateRecipe::execute(VPTransformState &State) {
7498   if (State.Instance) { // Generate a single instance.
7499     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7500     // Insert scalar instance packing it into a vector.
7501     if (AlsoPack && State.VF > 1) {
7502       // If we're constructing lane 0, initialize to start from undef.
7503       if (State.Instance->Lane == 0) {
7504         Value *Undef =
7505             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7506         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7507       }
7508       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7509     }
7510     return;
7511   }
7512 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane of each
  // of the UF parts.
7516   unsigned EndLane = IsUniform ? 1 : State.VF;
7517   for (unsigned Part = 0; Part < State.UF; ++Part)
7518     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7519       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7520 }
7521 
7522 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7523   assert(State.Instance && "Branch on Mask works only on single instance.");
7524 
7525   unsigned Part = State.Instance->Part;
7526   unsigned Lane = State.Instance->Lane;
7527 
7528   Value *ConditionBit = nullptr;
7529   if (!User) // Block in mask is all-one.
7530     ConditionBit = State.Builder.getTrue();
7531   else {
7532     VPValue *BlockInMask = User->getOperand(0);
7533     ConditionBit = State.get(BlockInMask, Part);
7534     if (ConditionBit->getType()->isVectorTy())
7535       ConditionBit = State.Builder.CreateExtractElement(
7536           ConditionBit, State.Builder.getInt32(Lane));
7537   }
7538 
7539   // Replace the temporary unreachable terminator with a new conditional branch,
7540   // whose two destinations will be set later when they are created.
7541   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7542   assert(isa<UnreachableInst>(CurrentTerminator) &&
7543          "Expected to replace unreachable terminator with conditional branch.");
7544   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7545   CondBr->setSuccessor(0, nullptr);
7546   ReplaceInstWithInst(CurrentTerminator, CondBr);
7547 }
7548 
7549 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7550   assert(State.Instance && "Predicated instruction PHI works per instance.");
7551   Instruction *ScalarPredInst = cast<Instruction>(
7552       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7553   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7554   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7555   assert(PredicatingBB && "Predicated block has no single predecessor.");
7556 
7557   // By current pack/unpack logic we need to generate only a single phi node: if
7558   // a vector value for the predicated instruction exists at this point it means
7559   // the instruction has vector users only, and a phi for the vector value is
7560   // needed. In this case the recipe of the predicated instruction is marked to
7561   // also do that packing, thereby "hoisting" the insert-element sequence.
7562   // Otherwise, a phi node for the scalar value is needed.
7563   unsigned Part = State.Instance->Part;
7564   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7565     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7566     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7567     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7568     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7569     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7570     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7571   } else {
7572     Type *PredInstType = PredInst->getType();
7573     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7574     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7575     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7576     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7577   }
7578 }
7579 
7580 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7581   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7582   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7583                                         getMask());
7584 }
7585 
7586 // Determine how to lower the scalar epilogue, which depends on 1) optimising
7587 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7588 // predication, and 4) a TTI hook that analyses whether the loop is suitable
7589 // for predication.
7590 static ScalarEpilogueLowering getScalarEpilogueLowering(
7591     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7592     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7593     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7594     LoopVectorizationLegality &LVL) {
7595   bool OptSize =
7596       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7597                                                      PGSOQueryType::IRPass);
7598   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7599   // don't look at hints or options, and don't request a scalar epilogue.
7600   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7601     return CM_ScalarEpilogueNotAllowedOptSize;
7602 
7603   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7604                               !PreferPredicateOverEpilog;
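  // PredicateOptDisabled is true only if PreferPredicateOverEpilog was
  // explicitly set to false on the command line.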
7605 
7606   // 2) Next, if disabling predication is requested on the command line, honour
7607   // this and request a scalar epilogue.
7608   if (PredicateOptDisabled)
7609     return CM_ScalarEpilogueAllowed;
7610 
  // 3) and 4) Check whether predication is requested on the command line or
  // with a loop hint, or whether the TTI hook indicates that it is profitable;
  // if so, request predication.
7614   if (PreferPredicateOverEpilog ||
7615       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7616       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7617                                         LVL.getLAI()) &&
7618        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7619     return CM_ScalarEpilogueNotNeededUsePredicate;
7620 
7621   return CM_ScalarEpilogueAllowed;
7622 }
7623 
7624 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
7626 // VPlan-to-VPlan transformations from the very beginning without modifying the
7627 // input LLVM IR.
7628 static bool processLoopInVPlanNativePath(
7629     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7630     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7631     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7632     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7633     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7634 
7635   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7636   Function *F = L->getHeader()->getParent();
7637   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7638 
7639   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7640       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7641 
7642   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7643                                 &Hints, IAI);
7644   // Use the planner for outer loop vectorization.
7645   // TODO: CM is not used at this point inside the planner. Turn CM into an
7646   // optional argument if we don't need it in the future.
7647   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7648 
7649   // Get user vectorization factor.
7650   const unsigned UserVF = Hints.getWidth();
7651 
7652   // Plan how to best vectorize, return the best VF and its cost.
7653   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7654 
7655   // If we are stress testing VPlan builds, do not attempt to generate vector
7656   // code. Masked vector code generation support will follow soon.
7657   // Also, do not attempt to vectorize if no vector code will be produced.
7658   if (VPlanBuildStressTest || EnableVPlanPredication ||
7659       VectorizationFactor::Disabled() == VF)
7660     return false;
7661 
7662   LVP.setBestPlan(VF.Width, 1);
7663 
7664   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7665                          &CM);
7666   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7667                     << L->getHeader()->getParent()->getName() << "\"\n");
7668   LVP.executePlan(LB, DT);
7669 
7670   // Mark the loop as already vectorized to avoid vectorizing again.
7671   Hints.setAlreadyVectorized();
7672 
7673   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7674   return true;
7675 }
7676 
7677 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
7678     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
7679                                !EnableLoopInterleaving),
7680       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
7681                               !EnableLoopVectorization) {}
7682 
7683 bool LoopVectorizePass::processLoop(Loop *L) {
7684   assert((EnableVPlanNativePath || L->empty()) &&
7685          "VPlan-native path is not enabled. Only process inner loops.");
7686 
7687 #ifndef NDEBUG
7688   const std::string DebugLocStr = getDebugLocString(L);
7689 #endif /* NDEBUG */
7690 
7691   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7692                     << L->getHeader()->getParent()->getName() << "\" from "
7693                     << DebugLocStr << "\n");
7694 
7695   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7696 
7697   LLVM_DEBUG(
7698       dbgs() << "LV: Loop hints:"
7699              << " force="
7700              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7701                      ? "disabled"
7702                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7703                             ? "enabled"
7704                             : "?"))
7705              << " width=" << Hints.getWidth()
7706              << " unroll=" << Hints.getInterleave() << "\n");
7707 
7708   // Function containing loop
7709   Function *F = L->getHeader()->getParent();
7710 
7711   // Looking at the diagnostic output is the only way to determine if a loop
7712   // was vectorized (other than looking at the IR or machine code), so it
7713   // is important to generate an optimization remark for each loop. Most of
7714   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7715   // generated as OptimizationRemark and OptimizationRemarkMissed are
7716   // less verbose reporting vectorized loops and unvectorized loops that may
7717   // benefit from vectorization, respectively.
7718 
7719   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7720     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7721     return false;
7722   }
7723 
7724   PredicatedScalarEvolution PSE(*SE, *L);
7725 
7726   // Check if it is legal to vectorize the loop.
7727   LoopVectorizationRequirements Requirements(*ORE);
7728   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7729                                 &Requirements, &Hints, DB, AC);
7730   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7731     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7732     Hints.emitRemarkWithHints();
7733     return false;
7734   }
7735 
7736   // Check the function attributes and profiles to find out if this function
7737   // should be optimized for size.
7738   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7739       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7740 
7741   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7742   // here. They may require CFG and instruction level transformations before
7743   // even evaluating whether vectorization is profitable. Since we cannot modify
7744   // the incoming IR, we need to build VPlan upfront in the vectorization
7745   // pipeline.
7746   if (!L->empty())
7747     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7748                                         ORE, BFI, PSI, Hints);
7749 
7750   assert(L->empty() && "Inner loop expected.");
7751 
7752   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7753   // count by optimizing for size, to minimize overheads.
7754   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7755   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7756     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7757                       << "This loop is worth vectorizing only if no scalar "
7758                       << "iteration overheads are incurred.");
7759     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7760       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7761     else {
7762       LLVM_DEBUG(dbgs() << "\n");
7763       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7764     }
7765   }
7766 
7767   // Check the function attributes to see if implicit floats are allowed.
7768   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7769   // an integer loop and the vector instructions selected are purely integer
7770   // vector instructions?
7771   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7772     reportVectorizationFailure(
7773         "Can't vectorize when the NoImplicitFloat attribute is used",
7774         "loop not vectorized due to NoImplicitFloat attribute",
7775         "NoImplicitFloat", ORE, L);
7776     Hints.emitRemarkWithHints();
7777     return false;
7778   }
7779 
7780   // Check if the target supports potentially unsafe FP vectorization.
7781   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7782   // for the target we're vectorizing for, to make sure none of the
7783   // additional fp-math flags can help.
7784   if (Hints.isPotentiallyUnsafe() &&
7785       TTI->isFPVectorizationPotentiallyUnsafe()) {
7786     reportVectorizationFailure(
7787         "Potentially unsafe FP op prevents vectorization",
7788         "loop not vectorized due to unsafe FP support.",
7789         "UnsafeFP", ORE, L);
7790     Hints.emitRemarkWithHints();
7791     return false;
7792   }
7793 
7794   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7795   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7796 
7797   // If an override option has been passed in for interleaved accesses, use it.
7798   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7799     UseInterleaved = EnableInterleavedMemAccesses;
7800 
7801   // Analyze interleaved memory accesses.
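  // This groups accesses that share a stride, e.g. A[2*i] and A[2*i+1], so
  // they can later be widened into a single wide memory operation plus
  // shuffles.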
7802   if (UseInterleaved) {
7803     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7804   }
7805 
7806   // Use the cost model.
7807   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7808                                 F, &Hints, IAI);
7809   CM.collectValuesToIgnore();
7810 
7811   // Use the planner for vectorization.
7812   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
7813 
7814   // Get user vectorization factor.
7815   unsigned UserVF = Hints.getWidth();
7816 
7817   // Plan how to best vectorize, return the best VF and its cost.
7818   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7819 
7820   VectorizationFactor VF = VectorizationFactor::Disabled();
7821   unsigned IC = 1;
7822   unsigned UserIC = Hints.getInterleave();
7823 
7824   if (MaybeVF) {
7825     VF = *MaybeVF;
7826     // Select the interleave count.
7827     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7828   }
7829 
7830   // Identify the diagnostic messages that should be produced.
7831   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7832   bool VectorizeLoop = true, InterleaveLoop = true;
7833   if (Requirements.doesNotMeet(F, L, Hints)) {
7834     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7835                          "requirements.\n");
7836     Hints.emitRemarkWithHints();
7837     return false;
7838   }
7839 
7840   if (VF.Width == 1) {
7841     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7842     VecDiagMsg = std::make_pair(
7843         "VectorizationNotBeneficial",
7844         "the cost-model indicates that vectorization is not beneficial");
7845     VectorizeLoop = false;
7846   }
7847 
7848   if (!MaybeVF && UserIC > 1) {
7849     // Tell the user interleaving was avoided up-front, despite being explicitly
7850     // requested.
7851     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7852                          "interleaving should be avoided up front\n");
7853     IntDiagMsg = std::make_pair(
7854         "InterleavingAvoided",
7855         "Ignoring UserIC, because interleaving was avoided up front");
7856     InterleaveLoop = false;
7857   } else if (IC == 1 && UserIC <= 1) {
7858     // Tell the user interleaving is not beneficial.
7859     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7860     IntDiagMsg = std::make_pair(
7861         "InterleavingNotBeneficial",
7862         "the cost-model indicates that interleaving is not beneficial");
7863     InterleaveLoop = false;
7864     if (UserIC == 1) {
7865       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7866       IntDiagMsg.second +=
7867           " and is explicitly disabled or interleave count is set to 1";
7868     }
7869   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it was explicitly disabled.
7871     LLVM_DEBUG(
7872         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7873     IntDiagMsg = std::make_pair(
7874         "InterleavingBeneficialButDisabled",
7875         "the cost-model indicates that interleaving is beneficial "
7876         "but is explicitly disabled or interleave count is set to 1");
7877     InterleaveLoop = false;
7878   }
7879 
7880   // Override IC if user provided an interleave count.
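  // A user-provided count comes from e.g. #pragma clang loop
  // interleave_count(N) or the corresponding llvm.loop.interleave.count
  // metadata.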
7881   IC = UserIC > 0 ? UserIC : IC;
7882 
7883   // Emit diagnostic messages, if any.
7884   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7885   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7887     ORE->emit([&]() {
7888       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7889                                       L->getStartLoc(), L->getHeader())
7890              << VecDiagMsg.second;
7891     });
7892     ORE->emit([&]() {
7893       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7894                                       L->getStartLoc(), L->getHeader())
7895              << IntDiagMsg.second;
7896     });
7897     return false;
7898   } else if (!VectorizeLoop && InterleaveLoop) {
7899     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7900     ORE->emit([&]() {
7901       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7902                                         L->getStartLoc(), L->getHeader())
7903              << VecDiagMsg.second;
7904     });
7905   } else if (VectorizeLoop && !InterleaveLoop) {
7906     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7907                       << ") in " << DebugLocStr << '\n');
7908     ORE->emit([&]() {
7909       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7910                                         L->getStartLoc(), L->getHeader())
7911              << IntDiagMsg.second;
7912     });
7913   } else if (VectorizeLoop && InterleaveLoop) {
7914     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7915                       << ") in " << DebugLocStr << '\n');
7916     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7917   }
7918 
7919   LVP.setBestPlan(VF.Width, IC);
7920 
7921   using namespace ore;
7922   bool DisableRuntimeUnroll = false;
7923   MDNode *OrigLoopID = L->getLoopID();
7924 
7925   if (!VectorizeLoop) {
7926     assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not profitable to vectorize the loop, then
    // interleave it.
7929     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7930                                &CM);
7931     LVP.executePlan(Unroller, DT);
7932 
7933     ORE->emit([&]() {
7934       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7935                                 L->getHeader())
7936              << "interleaved loop (interleaved count: "
7937              << NV("InterleaveCount", IC) << ")";
7938     });
7939   } else {
7940     // If we decided that it is *legal* to vectorize the loop, then do it.
7941     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7942                            &LVL, &CM);
7943     LVP.executePlan(LB, DT);
7944     ++LoopsVectorized;
7945 
7946     // Add metadata to disable runtime unrolling a scalar loop when there are
7947     // no runtime checks about strides and memory. A scalar loop that is
7948     // rarely used is not worth unrolling.
7949     if (!LB.areSafetyChecksAdded())
7950       DisableRuntimeUnroll = true;
7951 
7952     // Report the vectorization decision.
7953     ORE->emit([&]() {
7954       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7955                                 L->getHeader())
7956              << "vectorized loop (vectorization width: "
7957              << NV("VectorizationFactor", VF.Width)
7958              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7959     });
7960   }
7961 
7962   Optional<MDNode *> RemainderLoopID =
7963       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7964                                       LLVMLoopVectorizeFollowupEpilogue});
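  // If the user attached followup metadata (e.g.
  // llvm.loop.vectorize.followup_epilogue), transfer it to the scalar
  // remainder loop; otherwise fall back to the default annotations below.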
7965   if (RemainderLoopID.hasValue()) {
7966     L->setLoopID(RemainderLoopID.getValue());
7967   } else {
7968     if (DisableRuntimeUnroll)
7969       AddRuntimeUnrollDisableMetaData(L);
7970 
7971     // Mark the loop as already vectorized to avoid vectorizing again.
7972     Hints.setAlreadyVectorized();
7973   }
7974 
7975   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7976   return true;
7977 }
7978 
7979 LoopVectorizeResult LoopVectorizePass::runImpl(
7980     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7981     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7982     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7983     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7984     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7985   SE = &SE_;
7986   LI = &LI_;
7987   TTI = &TTI_;
7988   DT = &DT_;
7989   BFI = &BFI_;
7990   TLI = TLI_;
7991   AA = &AA_;
7992   AC = &AC_;
7993   GetLAA = &GetLAA_;
7994   DB = &DB_;
7995   ORE = &ORE_;
7996   PSI = PSI_;
7997 
7998   // Don't attempt if
7999   // 1. the target claims to have no vector registers, and
8000   // 2. interleaving won't help ILP.
8001   //
8002   // The second condition is necessary because, even if the target has no
8003   // vector registers, loop vectorization may still enable scalar
8004   // interleaving.
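  // For example, a scalar-only target that also reports a maximum interleave
  // factor of 1 makes the pass bail out here without analyzing any loops.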
8005   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8006       TTI->getMaxInterleaveFactor(1) < 2)
8007     return LoopVectorizeResult(false, false);
8008 
8009   bool Changed = false, CFGChanged = false;
8010 
8011   // The vectorizer requires loops to be in simplified form.
8012   // Since simplification may add new inner loops, it has to run before the
8013   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
8015   // vectorized.
8016   for (auto &L : *LI)
8017     Changed |= CFGChanged |=
8018         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8019 
8020   // Build up a worklist of inner-loops to vectorize. This is necessary as
8021   // the act of vectorizing or partially unrolling a loop creates new loops
8022   // and can invalidate iterators across the loops.
8023   SmallVector<Loop *, 8> Worklist;
8024 
8025   for (Loop *L : *LI)
8026     collectSupportedLoops(*L, LI, ORE, Worklist);
8027 
8028   LoopsAnalyzed += Worklist.size();
8029 
8030   // Now walk the identified inner loops.
8031   while (!Worklist.empty()) {
8032     Loop *L = Worklist.pop_back_val();
8033 
8034     // For the inner loops we actually process, form LCSSA to simplify the
8035     // transform.
8036     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8037 
8038     Changed |= CFGChanged |= processLoop(L);
8039   }
8040 
8041   // Process each loop nest in the function.
8042   return LoopVectorizeResult(Changed, CFGChanged);
8043 }
8044 
8045 PreservedAnalyses LoopVectorizePass::run(Function &F,
8046                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
8090 }
8091