1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
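//
// For illustration only (a sketch, independent of any particular target),
// with VF = 4 a scalar loop such as
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each 'wide' iteration processes four
// consecutive elements:
//
//   for (i = 0; i + 3 < n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3]; // one SIMD iteration
//
// with any leftover iterations handled by a scalar epilogue loop (or by
// predication, see below).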
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
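
// Illustrative sketch only: in IR, these follow-up attributes appear inside a
// loop's !llvm.loop metadata roughly as
//
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.vectorize.followup_vectorized", !2}
//
// where the trailing operands (!2 here) are the properties to attach to the
// corresponding loop generated by this pass.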
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
// Indicates that an epilogue is undesired and that predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
192              "will be determined by the smallest type in loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with a estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function for converting Scalar types to vector types.
299 /// If the incoming type is void, we return void. If the VF is 1, we return
300 /// the scalar type.
301 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
302   if (Scalar->isVoidTy() || VF == 1)
303     return Scalar;
304   return VectorType::get(Scalar, VF);
305 }
306 
307 /// A helper function that returns the type of loaded or stored value.
308 static Type *getMemInstValueType(Value *I) {
309   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
310          "Expected Load or Store instruction");
311   if (auto *LI = dyn_cast<LoadInst>(I))
312     return LI->getType();
313   return cast<StoreInst>(I)->getValueOperand()->getType();
314 }
315 
316 /// A helper function that returns true if the given type is irregular. The
317 /// type is irregular if its allocated size doesn't equal the store size of an
318 /// element of the corresponding vector type at the given vectorization factor.
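/// For example (illustrative, assuming a typical data layout): i1 has an
/// alloc size of 1 byte but <8 x i1> has a store size of 1 byte, so at VF = 8
/// we get 8 * 1 != 1 and i1 is irregular, whereas i32 at VF = 4 gives
/// 4 * 4 == 16, the store size of <4 x i32>, and is regular.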
319 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
320   // Determine if an array of VF elements of type Ty is "bitcast compatible"
321   // with a <VF x Ty> vector.
322   if (VF > 1) {
323     auto *VectorTy = VectorType::get(Ty, VF);
324     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
325   }
326 
327   // If the vectorization factor is one, we just check if an array of type Ty
328   // requires padding between elements.
329   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
330 }
331 
332 /// A helper function that returns the reciprocal of the block probability of
333 /// predicated blocks. If we return X, we are assuming the predicated block
334 /// will execute once for every X iterations of the loop header.
335 ///
336 /// TODO: We should use actual block probability here, if available. Currently,
337 ///       we always assume predicated blocks have a 50% chance of executing.
338 static unsigned getReciprocalPredBlockProb() { return 2; }
339 
340 /// A helper function that adds a 'fast' flag to floating-point operations.
341 static Value *addFastMathFlag(Value *V) {
342   if (isa<FPMathOperator>(V))
343     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
344   return V;
345 }
346 
347 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
348   if (isa<FPMathOperator>(V))
349     cast<Instruction>(V)->setFastMathFlags(FMF);
350   return V;
351 }
352 
353 /// A helper function that returns an integer or floating-point constant with
354 /// value C.
355 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
356   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
357                            : ConstantFP::get(Ty, C);
358 }
359 
360 /// Returns "best known" trip count for the specified loop \p L as defined by
361 /// the following procedure:
362 ///   1) Returns exact trip count if it is known.
363 ///   2) Returns expected trip count according to profile data if any.
364 ///   3) Returns upper bound estimate if it is known.
365 ///   4) Returns None if all of the above failed.
366 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
367   // Check if exact trip count is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
369     return ExpectedTC;
370 
371   // Check if there is an expected trip count available from profile data.
372   if (LoopVectorizeWithBlockFrequency)
373     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
374       return EstimatedTC;
375 
376   // Check if upper bound estimate is known.
377   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
378     return ExpectedTC;
379 
380   return None;
381 }
382 
383 namespace llvm {
384 
385 /// InnerLoopVectorizer vectorizes loops which contain only one basic
386 /// block to a specified vectorization factor (VF).
387 /// This class performs the widening of scalars into vectors, or multiple
388 /// scalars. This class also implements the following features:
389 /// * It inserts an epilogue loop for handling loops that don't have iteration
390 ///   counts that are known to be a multiple of the vectorization factor.
391 /// * It handles the code generation for reduction variables.
392 /// * Scalarization (implementation using scalars) of un-vectorizable
393 ///   instructions.
394 /// InnerLoopVectorizer does not perform any vectorization-legality
395 /// checks, and relies on the caller to check for the different legality
396 /// aspects. The InnerLoopVectorizer relies on the
397 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for the given vectorization factor.
399 class InnerLoopVectorizer {
400 public:
401   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
402                       LoopInfo *LI, DominatorTree *DT,
403                       const TargetLibraryInfo *TLI,
404                       const TargetTransformInfo *TTI, AssumptionCache *AC,
405                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
406                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
407                       LoopVectorizationCostModel *CM)
408       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
409         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
410         Builder(PSE.getSE()->getContext()),
411         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
412   virtual ~InnerLoopVectorizer() = default;
413 
414   /// Create a new empty loop. Unlink the old loop and connect the new one.
415   /// Return the pre-header block of the new loop.
416   BasicBlock *createVectorizedLoopSkeleton();
417 
418   /// Widen a single instruction within the innermost loop.
419   void widenInstruction(Instruction &I);
420 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
422   void fixVectorizedLoop();
423 
424   // Return true if any runtime check is added.
425   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
426 
427   /// A type for vectorized values in the new loop. Each value from the
428   /// original loop, when vectorized, is represented by UF vector values in the
429   /// new unrolled loop, where UF is the unroll factor.
430   using VectorParts = SmallVector<Value *, 2>;
431 
432   /// Vectorize a single GetElementPtrInst based on information gathered and
433   /// decisions taken during planning.
434   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
435                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
436 
  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 (for unrolled loops)
  /// and arbitrary-length vectors.
440   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
441 
442   /// A helper function to scalarize a single Instruction in the innermost loop.
443   /// Generates a sequence of scalar instances for each lane between \p MinLane
444   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
446   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
447                             bool IfPredicateInstr);
448 
449   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
450   /// is provided, the integer induction variable will first be truncated to
451   /// the corresponding type.
452   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
453 
454   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
455   /// vector or scalar value on-demand if one is not yet available. When
456   /// vectorizing a loop, we visit the definition of an instruction before its
457   /// uses. When visiting the definition, we either vectorize or scalarize the
458   /// instruction, creating an entry for it in the corresponding map. (In some
459   /// cases, such as induction variables, we will create both vector and scalar
460   /// entries.) Then, as we encounter uses of the definition, we derive values
461   /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is a vector,
463   /// we build the required vector on-demand with an insertelement sequence
464   /// when visiting the use. Otherwise, if the use is scalar, we can use the
465   /// existing scalar definition.
466   ///
467   /// Return a value in the new loop corresponding to \p V from the original
468   /// loop at unroll index \p Part. If the value has already been vectorized,
469   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
470   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
471   /// a new vector value on-demand by inserting the scalar values into a vector
472   /// with an insertelement sequence. If the value has been neither vectorized
473   /// nor scalarized, it must be loop invariant, so we simply broadcast the
474   /// value into a vector.
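  ///
  /// As an illustrative sketch (hypothetical value names), if a definition
  /// %d was scalarized at VF = 4 and a vector use is later encountered, the
  /// on-demand packing looks roughly like:
  ///
  ///   %v0 = insertelement <4 x i32> undef, i32 %d.0, i32 0
  ///   %v1 = insertelement <4 x i32> %v0,   i32 %d.1, i32 1
  ///   %v2 = insertelement <4 x i32> %v1,   i32 %d.2, i32 2
  ///   %v3 = insertelement <4 x i32> %v2,   i32 %d.3, i32 3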
475   Value *getOrCreateVectorValue(Value *V, unsigned Part);
476 
477   /// Return a value in the new loop corresponding to \p V from the original
478   /// loop at unroll and vector indices \p Instance. If the value has been
479   /// vectorized but not scalarized, the necessary extractelement instruction
480   /// will be generated.
481   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
482 
483   /// Construct the vector value of a scalarized value \p V one lane at a time.
484   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
485 
486   /// Try to vectorize the interleaved access group that \p Instr belongs to
487   /// with the base address given in \p Addr, optionally masking the vector
488   /// operations if \p BlockInMask is non-null. Use \p State to translate given
489   /// VPValues to IR values in the vectorized loop.
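  ///
  /// Illustrative sketch: for a stride-2 load group {A[2*i], A[2*i+1]} at
  /// VF = 4, this emits roughly one wide load followed by de-interleaving
  /// shuffles:
  ///
  ///   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
  ///   %even = shufflevector <8 x i32> %wide.vec, <8 x i32> undef,
  ///                         <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ///   %odd  = shufflevector <8 x i32> %wide.vec, <8 x i32> undef,
  ///                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>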
490   void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
491                                 VPValue *Addr, VPValue *BlockInMask = nullptr);
492 
493   /// Vectorize Load and Store instructions with the base address given in \p
494   /// Addr, optionally masking the vector operations if \p BlockInMask is
495   /// non-null. Use \p State to translate given VPValues to IR values in the
496   /// vectorized loop.
497   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
498                                   VPValue *Addr,
499                                   VPValue *BlockInMask = nullptr);
500 
501   /// Set the debug location in the builder using the debug location in
502   /// the instruction.
503   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
504 
505   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
507 
508 protected:
509   friend class LoopVectorizationPlanner;
510 
511   /// A small list of PHINodes.
512   using PhiVector = SmallVector<PHINode *, 4>;
513 
514   /// A type for scalarized values in the new loop. Each value from the
515   /// original loop, when scalarized, is represented by UF x VF scalar values
516   /// in the new unrolled loop, where UF is the unroll factor and VF is the
517   /// vectorization factor.
518   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
519 
520   /// Set up the values of the IVs correctly when exiting the vector loop.
521   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
522                     Value *CountRoundDown, Value *EndValue,
523                     BasicBlock *MiddleBlock);
524 
525   /// Create a new induction variable inside L.
526   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
527                                    Value *Step, Instruction *DL);
528 
529   /// Handle all cross-iteration phis in the header.
530   void fixCrossIterationPHIs();
531 
532   /// Fix a first-order recurrence. This is the second phase of vectorizing
533   /// this phi node.
534   void fixFirstOrderRecurrence(PHINode *Phi);
535 
536   /// Fix a reduction cross-iteration phi. This is the second phase of
537   /// vectorizing this phi node.
538   void fixReduction(PHINode *Phi);
539 
540   /// Clear NSW/NUW flags from reduction instructions if necessary.
541   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
542 
  /// The loop exit block may have single-value PHI nodes with some
  /// incoming value. While vectorizing, we only handle real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
547   void fixLCSSAPHIs();
548 
549   /// Iteratively sink the scalarized operands of a predicated instruction into
550   /// the block that was created for it.
551   void sinkScalarOperands(Instruction *PredInst);
552 
553   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
554   /// represented as.
555   void truncateToMinimalBitwidths();
556 
557   /// Create a broadcast instruction. This method generates a broadcast
558   /// instruction (shuffle) for loop invariant values and for the induction
559   /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
561   /// element.
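  ///
  /// Illustrative sketch of the IR emitted for a loop-invariant i32 value %x
  /// at VF = 4:
  ///
  ///   %bc.ins = insertelement <4 x i32> undef, i32 %x, i32 0
  ///   %bc     = shufflevector <4 x i32> %bc.ins, <4 x i32> undef,
  ///                           <4 x i32> zeroinitializer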
562   virtual Value *getBroadcastInstrs(Value *V);
563 
564   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
565   /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variables.
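  /// Illustrative sketch: with Val = <%i, %i, %i, %i>, StartIdx = 0 and
  /// Step = 1, the result is <%i + 0, %i + 1, %i + 2, %i + 3>.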
567   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
568                                Instruction::BinaryOps Opcode =
569                                Instruction::BinaryOpsEnd);
570 
571   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
572   /// variable on which to base the steps, \p Step is the size of the step, and
573   /// \p EntryVal is the value from the original loop that maps to the steps.
574   /// Note that \p EntryVal doesn't have to be an induction variable - it
575   /// can also be a truncate instruction.
576   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
577                         const InductionDescriptor &ID);
578 
579   /// Create a vector induction phi node based on an existing scalar one. \p
580   /// EntryVal is the value from the original loop that maps to the vector phi
581   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
582   /// truncate instruction, instead of widening the original IV, we widen a
583   /// version of the IV truncated to \p EntryVal's type.
584   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
585                                        Value *Step, Instruction *EntryVal);
586 
587   /// Returns true if an instruction \p I should be scalarized instead of
588   /// vectorized for the chosen vectorization factor.
589   bool shouldScalarizeInstruction(Instruction *I) const;
590 
591   /// Returns true if we should generate a scalar version of \p IV.
592   bool needsScalarInduction(Instruction *IV) const;
593 
594   /// If there is a cast involved in the induction variable \p ID, which should
595   /// be ignored in the vectorized loop body, this function records the
596   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
597   /// cast. We had already proved that the casted Phi is equal to the uncasted
598   /// Phi in the vectorized loop (under a runtime guard), and therefore
599   /// there is no need to vectorize the cast - the same value can be used in the
600   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
603   ///
604   /// \p EntryVal is the value from the original loop that maps to the vector
605   /// phi node and is used to distinguish what is the IV currently being
606   /// processed - original one (if \p EntryVal is a phi corresponding to the
607   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
609   /// latter case \p EntryVal is a TruncInst and we must not record anything for
610   /// that IV, but it's error-prone to expect callers of this routine to care
611   /// about that, hence this explicit parameter.
612   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
613                                              const Instruction *EntryVal,
614                                              Value *VectorLoopValue,
615                                              unsigned Part,
616                                              unsigned Lane = UINT_MAX);
617 
618   /// Generate a shuffle sequence that will reverse the vector Vec.
619   virtual Value *reverseVector(Value *Vec);
620 
621   /// Returns (and creates if needed) the original loop trip count.
622   Value *getOrCreateTripCount(Loop *NewLoop);
623 
624   /// Returns (and creates if needed) the trip count of the widened loop.
625   Value *getOrCreateVectorTripCount(Loop *NewLoop);
626 
627   /// Returns a bitcasted value to the requested vector type.
628   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
629   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
630                                 const DataLayout &DL);
631 
632   /// Emit a bypass check to see if the vector trip count is zero, including if
633   /// it overflows.
634   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
635 
636   /// Emit a bypass check to see if all of the SCEV assumptions we've
637   /// had to make are correct.
638   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
639 
640   /// Emit bypass checks to check any memory assumptions we may have made.
641   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
642 
643   /// Compute the transformed value of Index at offset StartValue using step
644   /// StepValue.
645   /// For integer induction, returns StartValue + Index * StepValue.
646   /// For pointer induction, returns StartValue[Index * StepValue].
647   /// FIXME: The newly created binary instructions should contain nsw/nuw
648   /// flags, which can be found from the original scalar operations.
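  ///
  /// For example (sketch): an integer induction with StartValue 4 and
  /// StepValue 3 maps Index 2 to 4 + 2 * 3 = 10, while a pointer induction
  /// instead produces a GEP of StartValue by Index * StepValue.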
649   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
650                               const DataLayout &DL,
651                               const InductionDescriptor &ID) const;
652 
653   /// Add additional metadata to \p To that was not present on \p Orig.
654   ///
655   /// Currently this is used to add the noalias annotations based on the
656   /// inserted memchecks.  Use this for instructions that are *cloned* into the
657   /// vector loop.
658   void addNewMetadata(Instruction *To, const Instruction *Orig);
659 
660   /// Add metadata from one instruction to another.
661   ///
662   /// This includes both the original MDs from \p From and additional ones (\see
663   /// addNewMetadata).  Use this for *newly created* instructions in the vector
664   /// loop.
665   void addMetadata(Instruction *To, Instruction *From);
666 
667   /// Similar to the previous function but it adds the metadata to a
668   /// vector of instructions.
669   void addMetadata(ArrayRef<Value *> To, Instruction *From);
670 
671   /// The original loop.
672   Loop *OrigLoop;
673 
674   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
675   /// dynamic knowledge to simplify SCEV expressions and converts them to a
676   /// more usable form.
677   PredicatedScalarEvolution &PSE;
678 
679   /// Loop Info.
680   LoopInfo *LI;
681 
682   /// Dominator Tree.
683   DominatorTree *DT;
684 
685   /// Alias Analysis.
686   AliasAnalysis *AA;
687 
688   /// Target Library Info.
689   const TargetLibraryInfo *TLI;
690 
691   /// Target Transform Info.
692   const TargetTransformInfo *TTI;
693 
694   /// Assumption Cache.
695   AssumptionCache *AC;
696 
697   /// Interface to emit optimization remarks.
698   OptimizationRemarkEmitter *ORE;
699 
700   /// LoopVersioning.  It's only set up (non-null) if memchecks were
701   /// used.
702   ///
703   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
705   std::unique_ptr<LoopVersioning> LVer;
706 
707   /// The vectorization SIMD factor to use. Each vector will have this many
708   /// vector elements.
709   unsigned VF;
710 
711   /// The vectorization unroll factor to use. Each scalar is vectorized to this
712   /// many different vector instructions.
713   unsigned UF;
714 
715   /// The builder that we use
716   IRBuilder<> Builder;
717 
718   // --- Vectorization state ---
719 
720   /// The vector-loop preheader.
721   BasicBlock *LoopVectorPreHeader;
722 
723   /// The scalar-loop preheader.
724   BasicBlock *LoopScalarPreHeader;
725 
726   /// Middle Block between the vector and the scalar.
727   BasicBlock *LoopMiddleBlock;
728 
729   /// The ExitBlock of the scalar loop.
730   BasicBlock *LoopExitBlock;
731 
732   /// The vector loop body.
733   BasicBlock *LoopVectorBody;
734 
735   /// The scalar loop body.
736   BasicBlock *LoopScalarBody;
737 
738   /// A list of all bypass blocks. The first block is the entry of the loop.
739   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
740 
741   /// The new Induction variable which was added to the new block.
742   PHINode *Induction = nullptr;
743 
744   /// The induction variable of the old basic block.
745   PHINode *OldInduction = nullptr;
746 
747   /// Maps values from the original loop to their corresponding values in the
748   /// vectorized loop. A key value can map to either vector values, scalar
749   /// values or both kinds of values, depending on whether the key was
750   /// vectorized and scalarized.
751   VectorizerValueMap VectorLoopValueMap;
752 
753   /// Store instructions that were predicated.
754   SmallVector<Instruction *, 4> PredicatedInstructions;
755 
756   /// Trip count of the original loop.
757   Value *TripCount = nullptr;
758 
759   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
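  /// (e.g. with TripCount == 10, VF == 4 and UF == 2 this is 10 - 10 % 8 = 8).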
760   Value *VectorTripCount = nullptr;
761 
762   /// The legality analysis.
763   LoopVectorizationLegality *Legal;
764 
  /// The profitability analysis.
766   LoopVectorizationCostModel *Cost;
767 
768   // Record whether runtime checks are added.
769   bool AddedSafetyChecks = false;
770 
771   // Holds the end values for each induction variable. We save the end values
772   // so we can later fix-up the external users of the induction variables.
773   DenseMap<PHINode *, Value *> IVEndValues;
774 
775   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
776   // fixed up at the end of vector code generation.
777   SmallVector<PHINode *, 8> OrigPHIsToFix;
778 };
779 
780 class InnerLoopUnroller : public InnerLoopVectorizer {
781 public:
782   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
783                     LoopInfo *LI, DominatorTree *DT,
784                     const TargetLibraryInfo *TLI,
785                     const TargetTransformInfo *TTI, AssumptionCache *AC,
786                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
787                     LoopVectorizationLegality *LVL,
788                     LoopVectorizationCostModel *CM)
789       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
790                             UnrollFactor, LVL, CM) {}
791 
792 private:
793   Value *getBroadcastInstrs(Value *V) override;
794   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
795                        Instruction::BinaryOps Opcode =
796                        Instruction::BinaryOpsEnd) override;
797   Value *reverseVector(Value *Vec) override;
798 };
799 
800 } // end namespace llvm
801 
/// Look for a meaningful debug location on the instruction or its
803 /// operands.
804 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
805   if (!I)
806     return I;
807 
808   DebugLoc Empty;
809   if (I->getDebugLoc() != Empty)
810     return I;
811 
  for (Value *Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }
817 
818   return I;
819 }
820 
821 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
822   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
823     const DILocation *DIL = Inst->getDebugLoc();
824     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
825         !isa<DbgInfoIntrinsic>(Inst)) {
826       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
827       if (NewDIL)
828         B.SetCurrentDebugLocation(NewDIL.getValue());
829       else
830         LLVM_DEBUG(dbgs()
831                    << "Failed to create new discriminator: "
832                    << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
835       B.SetCurrentDebugLocation(DIL);
836   } else
837     B.SetCurrentDebugLocation(DebugLoc());
838 }
839 
840 /// Write a record \p DebugMsg about vectorization failure to the debug
841 /// output stream. If \p I is passed, it is an instruction that prevents
842 /// vectorization.
843 #ifndef NDEBUG
844 static void debugVectorizationFailure(const StringRef DebugMsg,
845     Instruction *I) {
846   dbgs() << "LV: Not vectorizing: " << DebugMsg;
847   if (I != nullptr)
848     dbgs() << " " << *I;
849   else
850     dbgs() << '.';
851   dbgs() << '\n';
852 }
853 #endif
854 
855 /// Create an analysis remark that explains why vectorization failed
856 ///
857 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
858 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
859 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
860 /// the location of the remark.  \return the remark object that can be
861 /// streamed to.
862 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
863     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
864   Value *CodeRegion = TheLoop->getHeader();
865   DebugLoc DL = TheLoop->getStartLoc();
866 
867   if (I) {
868     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
871     if (I->getDebugLoc())
872       DL = I->getDebugLoc();
873   }
874 
875   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
876   R << "loop not vectorized: ";
877   return R;
878 }
879 
880 namespace llvm {
881 
882 void reportVectorizationFailure(const StringRef DebugMsg,
883     const StringRef OREMsg, const StringRef ORETag,
884     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
885   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
886   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
887   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
888                 ORETag, TheLoop, I) << OREMsg);
889 }
890 
891 } // end namespace llvm
892 
893 #ifndef NDEBUG
894 /// \return string containing a file name and a line # for the given loop.
895 static std::string getDebugLocString(const Loop *L) {
896   std::string Result;
897   if (L) {
898     raw_string_ostream OS(Result);
899     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
900       LoopDbgLoc.print(OS);
901     else
902       // Just print the module name.
903       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
904     OS.flush();
905   }
906   return Result;
907 }
908 #endif
909 
910 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
911                                          const Instruction *Orig) {
912   // If the loop was versioned with memchecks, add the corresponding no-alias
913   // metadata.
914   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
915     LVer->annotateInstWithNoAlias(To, Orig);
916 }
917 
918 void InnerLoopVectorizer::addMetadata(Instruction *To,
919                                       Instruction *From) {
920   propagateMetadata(To, From);
921   addNewMetadata(To, From);
922 }
923 
924 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
925                                       Instruction *From) {
926   for (Value *V : To) {
927     if (Instruction *I = dyn_cast<Instruction>(V))
928       addMetadata(I, From);
929   }
930 }
931 
932 namespace llvm {
933 
// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
936 enum ScalarEpilogueLowering {
937 
938   // The default: allowing scalar epilogues.
939   CM_ScalarEpilogueAllowed,
940 
941   // Vectorization with OptForSize: don't allow epilogues.
942   CM_ScalarEpilogueNotAllowedOptSize,
943 
  // A special case of vectorization with OptForSize: loops with a very small
945   // trip count are considered for vectorization under OptForSize, thereby
946   // making sure the cost of their loop body is dominant, free of runtime
947   // guards and scalar iteration overheads.
948   CM_ScalarEpilogueNotAllowedLowTripLoop,
949 
950   // Loop hint predicate indicating an epilogue is undesired.
951   CM_ScalarEpilogueNotNeededUsePredicate
952 };
953 
954 /// LoopVectorizationCostModel - estimates the expected speedups due to
955 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
958 /// expected speedup/slowdowns due to the supported instruction set. We use the
959 /// TargetTransformInfo to query the different backends for the cost of
960 /// different operations.
961 class LoopVectorizationCostModel {
962 public:
963   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
964                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
965                              LoopVectorizationLegality *Legal,
966                              const TargetTransformInfo &TTI,
967                              const TargetLibraryInfo *TLI, DemandedBits *DB,
968                              AssumptionCache *AC,
969                              OptimizationRemarkEmitter *ORE, const Function *F,
970                              const LoopVectorizeHints *Hints,
971                              InterleavedAccessInfo &IAI)
972       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
973         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
974         Hints(Hints), InterleaveInfo(IAI) {}
975 
976   /// \return An upper bound for the vectorization factor, or None if
977   /// vectorization and interleaving should be avoided up front.
978   Optional<unsigned> computeMaxVF();
979 
980   /// \return True if runtime checks are required for vectorization, and false
981   /// otherwise.
982   bool runtimeChecksRequired();
983 
984   /// \return The most profitable vectorization factor and the cost of that VF.
985   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
986   /// then this vectorization factor will be selected if vectorization is
987   /// possible.
988   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
989 
990   /// Setup cost-based decisions for user vectorization factor.
991   void selectUserVectorizationFactor(unsigned UserVF) {
992     collectUniformsAndScalars(UserVF);
993     collectInstsToScalarize(UserVF);
994   }
995 
996   /// \return The size (in bits) of the smallest and widest types in the code
997   /// that needs to be vectorized. We ignore values that remain scalar such as
998   /// 64 bit loop indices.
999   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1000 
1001   /// \return The desired interleave count.
1002   /// If interleave count has been specified by metadata it will be returned.
1003   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1004   /// are the selected vectorization factor and the cost of the selected VF.
1005   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1006 
1007   /// Memory access instruction may be vectorized in more than one way.
1008   /// Form of instruction after vectorization depends on cost.
1009   /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
1013   /// avoid redundant calculations.
1014   void setCostBasedWideningDecision(unsigned VF);
1015 
1016   /// A struct that represents some properties of the register usage
1017   /// of a loop.
1018   struct RegisterUsage {
1019     /// Holds the number of loop invariant values that are used in the loop.
1020     /// The key is ClassID of target-provided register class.
1021     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1022     /// Holds the maximum number of concurrent live intervals in the loop.
1023     /// The key is ClassID of target-provided register class.
1024     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1025   };
1026 
1027   /// \return Returns information about the register usages of the loop for the
1028   /// given vectorization factors.
1029   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1030 
1031   /// Collect values we want to ignore in the cost model.
1032   void collectValuesToIgnore();
1033 
1034   /// \returns The smallest bitwidth each instruction can be represented with.
1035   /// The vector equivalents of these instructions should be truncated to this
1036   /// type.
1037   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1038     return MinBWs;
1039   }
1040 
1041   /// \returns True if it is more profitable to scalarize instruction \p I for
1042   /// vectorization factor \p VF.
1043   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1044     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1045 
1046     // Cost model is not run in the VPlan-native path - return conservative
1047     // result until this changes.
1048     if (EnableVPlanNativePath)
1049       return false;
1050 
1051     auto Scalars = InstsToScalarize.find(VF);
1052     assert(Scalars != InstsToScalarize.end() &&
1053            "VF not yet analyzed for scalarization profitability");
1054     return Scalars->second.find(I) != Scalars->second.end();
1055   }
1056 
1057   /// Returns true if \p I is known to be uniform after vectorization.
1058   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1059     if (VF == 1)
1060       return true;
1061 
1062     // Cost model is not run in the VPlan-native path - return conservative
1063     // result until this changes.
1064     if (EnableVPlanNativePath)
1065       return false;
1066 
1067     auto UniformsPerVF = Uniforms.find(VF);
1068     assert(UniformsPerVF != Uniforms.end() &&
1069            "VF not yet analyzed for uniformity");
1070     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1071   }
1072 
1073   /// Returns true if \p I is known to be scalar after vectorization.
1074   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1075     if (VF == 1)
1076       return true;
1077 
1078     // Cost model is not run in the VPlan-native path - return conservative
1079     // result until this changes.
1080     if (EnableVPlanNativePath)
1081       return false;
1082 
1083     auto ScalarsPerVF = Scalars.find(VF);
1084     assert(ScalarsPerVF != Scalars.end() &&
1085            "Scalar values are not calculated for VF");
1086     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1087   }
1088 
1089   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1090   /// for vectorization factor \p VF.
1091   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1092     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1093            !isProfitableToScalarize(I, VF) &&
1094            !isScalarAfterVectorization(I, VF);
1095   }
1096 
1097   /// Decision that was taken during cost calculation for memory instruction.
1098   enum InstWidening {
1099     CM_Unknown,
1100     CM_Widen,         // For consecutive accesses with stride +1.
1101     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1102     CM_Interleave,
1103     CM_GatherScatter,
1104     CM_Scalarize
1105   };
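
  // Illustrative sketch: at VF = 4, a load of A[i] with stride +1 typically
  // maps to CM_Widen (one wide load), A[-i] with stride -1 to CM_Widen_Reverse
  // (a wide load plus a reverse shuffle), a strided or indexed access to
  // CM_GatherScatter or CM_Scalarize depending on target support and cost,
  // and members of an interleave group to CM_Interleave.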
1106 
1107   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1108   /// instruction \p I and vector width \p VF.
1109   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1110                            unsigned Cost) {
1111     assert(VF >= 2 && "Expected VF >=2");
1112     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1113   }
1114 
1115   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1116   /// interleaving group \p Grp and vector width \p VF.
1117   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1118                            InstWidening W, unsigned Cost) {
1119     assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1122     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1123       if (auto *I = Grp->getMember(i)) {
1124         if (Grp->getInsertPos() == I)
1125           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1126         else
1127           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1128       }
1129     }
1130   }
1131 
1132   /// Return the cost model decision for the given instruction \p I and vector
1133   /// width \p VF. Return CM_Unknown if this instruction did not pass
1134   /// through the cost modeling.
1135   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1136     assert(VF >= 2 && "Expected VF >=2");
1137 
1138     // Cost model is not run in the VPlan-native path - return conservative
1139     // result until this changes.
1140     if (EnableVPlanNativePath)
1141       return CM_GatherScatter;
1142 
1143     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1144     auto Itr = WideningDecisions.find(InstOnVF);
1145     if (Itr == WideningDecisions.end())
1146       return CM_Unknown;
1147     return Itr->second.first;
1148   }
1149 
1150   /// Return the vectorization cost for the given instruction \p I and vector
1151   /// width \p VF.
1152   unsigned getWideningCost(Instruction *I, unsigned VF) {
1153     assert(VF >= 2 && "Expected VF >=2");
1154     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1155     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1156            "The cost is not calculated");
1157     return WideningDecisions[InstOnVF].second;
1158   }
1159 
1160   /// Return True if instruction \p I is an optimizable truncate whose operand
1161   /// is an induction variable. Such a truncate will be removed by adding a new
1162   /// induction variable with the destination type.
1163   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1164     // If the instruction is not a truncate, return false.
1165     auto *Trunc = dyn_cast<TruncInst>(I);
1166     if (!Trunc)
1167       return false;
1168 
1169     // Get the source and destination types of the truncate.
1170     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1171     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1172 
1173     // If the truncate is free for the given types, return false. Replacing a
1174     // free truncate with an induction variable would add an induction variable
1175     // update instruction to each iteration of the loop. We exclude from this
1176     // check the primary induction variable since it will need an update
1177     // instruction regardless.
1178     Value *Op = Trunc->getOperand(0);
1179     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1180       return false;
1181 
1182     // If the truncated value is not an induction variable, return false.
1183     return Legal->isInductionPhi(Op);
1184   }
1185 
1186   /// Collects the instructions to scalarize for each predicated instruction in
1187   /// the loop.
1188   void collectInstsToScalarize(unsigned VF);
1189 
1190   /// Collect Uniform and Scalar values for the given \p VF.
1191   /// The sets depend on CM decision for Load/Store instructions
1192   /// that may be vectorized as interleave, gather-scatter or scalarized.
1193   void collectUniformsAndScalars(unsigned VF) {
1194     // Do the analysis once.
1195     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1196       return;
1197     setCostBasedWideningDecision(VF);
1198     collectLoopUniforms(VF);
1199     collectLoopScalars(VF);
1200   }
1201 
1202   /// Returns true if the target machine supports masked store operation
1203   /// for the given \p DataType and kind of access to \p Ptr.
1204   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1205     return Legal->isConsecutivePtr(Ptr) &&
1206            TTI.isLegalMaskedStore(DataType, Alignment);
1207   }
1208 
1209   /// Returns true if the target machine supports masked load operation
1210   /// for the given \p DataType and kind of access to \p Ptr.
1211   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1212     return Legal->isConsecutivePtr(Ptr) &&
1213            TTI.isLegalMaskedLoad(DataType, Alignment);
1214   }
1215 
1216   /// Returns true if the target machine supports masked scatter operation
1217   /// for the given \p DataType.
1218   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1219     return TTI.isLegalMaskedScatter(DataType, Alignment);
1220   }
1221 
1222   /// Returns true if the target machine supports masked gather operation
1223   /// for the given \p DataType.
1224   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1225     return TTI.isLegalMaskedGather(DataType, Alignment);
1226   }
1227 
1228   /// Returns true if the target machine can represent \p V as a masked gather
1229   /// or scatter operation.
1230   bool isLegalGatherOrScatter(Value *V) {
1231     bool LI = isa<LoadInst>(V);
1232     bool SI = isa<StoreInst>(V);
1233     if (!LI && !SI)
1234       return false;
1235     auto *Ty = getMemInstValueType(V);
1236     MaybeAlign Align = getLoadStoreAlignment(V);
1237     return (LI && isLegalMaskedGather(Ty, Align)) ||
1238            (SI && isLegalMaskedScatter(Ty, Align));
1239   }
1240 
1241   /// Returns true if \p I is an instruction that will be scalarized with
1242   /// predication. Such instructions include conditional stores and
1243   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
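  ///
  /// For illustration only (hypothetical source, not taken from this pass):
  ///
  ///   if (c[i] != 0)
  ///     a[i] = b[i] / c[i];
  ///
  /// The division may trap if executed unconditionally, so when the block is
  /// vectorized the division is scalarized and executed under a predicate.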
1246   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1247 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// This is a superset of the instructions for which isScalarWithPredication
  /// returns true.
1251   bool isPredicatedInst(Instruction *I) {
1252     if (!blockNeedsPredication(I->getParent()))
1253       return false;
1254     // Loads and stores that need some form of masked operation are predicated
1255     // instructions.
1256     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1257       return Legal->isMaskRequired(I);
1258     return isScalarWithPredication(I);
1259   }
1260 
1261   /// Returns true if \p I is a memory instruction with consecutive memory
1262   /// access that can be widened.
1263   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1264 
1265   /// Returns true if \p I is a memory instruction in an interleaved-group
1266   /// of memory accesses that can be vectorized with wide vector loads/stores
1267   /// and shuffles.
1268   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1269 
1270   /// Check if \p Instr belongs to any interleaved access group.
1271   bool isAccessInterleaved(Instruction *Instr) {
1272     return InterleaveInfo.isInterleaved(Instr);
1273   }
1274 
1275   /// Get the interleaved access group that \p Instr belongs to.
1276   const InterleaveGroup<Instruction> *
1277   getInterleavedAccessGroup(Instruction *Instr) {
1278     return InterleaveInfo.getInterleaveGroup(Instr);
1279   }
1280 
1281   /// Returns true if an interleaved group requires a scalar iteration
1282   /// to handle accesses with gaps, and there is nothing preventing us from
1283   /// creating a scalar epilogue.
1284   bool requiresScalarEpilogue() const {
1285     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1286   }
1287 
  /// Returns true if a scalar epilogue is allowed, i.e., not disallowed due to
  /// optsize or a loop hint annotation.
1290   bool isScalarEpilogueAllowed() const {
1291     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1292   }
1293 
  /// Returns true if all loop blocks should be masked in order to fold the
  /// loop tail.
1295   bool foldTailByMasking() const { return FoldTailByMasking; }
1296 
1297   bool blockNeedsPredication(BasicBlock *BB) {
1298     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1299   }
1300 
1301   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1302   /// with factor VF.  Return the cost of the instruction, including
1303   /// scalarization overhead if it's needed.
1304   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1305 
  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e., either a vector version isn't available or it is too
  /// expensive.
1311   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1312 
1313 private:
1314   unsigned NumPredStores = 0;
1315 
1316   /// \return An upper bound for the vectorization factor, larger than zero.
1317   /// One is returned if vectorization should best be avoided due to cost.
1318   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1319 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1327   using VectorizationCostTy = std::pair<unsigned, bool>;
1328 
1329   /// Returns the expected execution cost. The unit of the cost does
1330   /// not matter because we use the 'cost' units to compare different
1331   /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
1333   VectorizationCostTy expectedCost(unsigned VF);
1334 
1335   /// Returns the execution time cost of an instruction for a given vector
1336   /// width. Vector width of one means scalar.
1337   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1338 
1339   /// The cost-computation logic from getInstructionCost which provides
1340   /// the vector type as an output parameter.
1341   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1342 
1343   /// Calculate vectorization cost of memory instruction \p I.
1344   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1345 
1346   /// The cost computation for scalarized memory instruction.
1347   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1348 
1349   /// The cost computation for interleaving group of memory instructions.
1350   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1351 
1352   /// The cost computation for Gather/Scatter instruction.
1353   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1354 
1355   /// The cost computation for widening instruction \p I with consecutive
1356   /// memory access.
1357   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1358 
1359   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1360   /// Load: scalar load + broadcast.
1361   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1362   /// element)
1363   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1364 
1365   /// Estimate the overhead of scalarizing an instruction. This is a
1366   /// convenience wrapper for the type-based getScalarizationOverhead API.
1367   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1368 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1371   bool isConsecutiveLoadOrStore(Instruction *I);
1372 
1373   /// Returns true if an artificially high cost for emulated masked memrefs
1374   /// should be used.
1375   bool useEmulatedMaskMemRefHack(Instruction *I);
1376 
1377   /// Map of scalar integer values to the smallest bitwidth they can be legally
1378   /// represented as. The vector equivalents of these values should be truncated
1379   /// to this type.
1380   MapVector<Instruction *, uint64_t> MinBWs;
1381 
1382   /// A type representing the costs for instructions if they were to be
1383   /// scalarized rather than vectorized. The entries are Instruction-Cost
1384   /// pairs.
1385   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1386 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1389   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1390 
1391   /// Records whether it is allowed to have the original scalar loop execute at
1392   /// least once. This may be needed as a fallback loop in case runtime
1393   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not a multiple of the VF,
1395   /// or as a peel-loop to handle gaps in interleave-groups.
1396   /// Under optsize and when the trip count is very small we don't allow any
1397   /// iterations to execute in the scalar loop.
1398   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1399 
  /// All blocks of the loop are to be masked in order to fold the tail of
  /// scalar iterations.
1401   bool FoldTailByMasking = false;
1402 
1403   /// A map holding scalar costs for different vectorization factors. The
1404   /// presence of a cost for an instruction in the mapping indicates that the
1405   /// instruction will be scalarized when vectorizing with the associated
1406   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1407   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1408 
1409   /// Holds the instructions known to be uniform after vectorization.
1410   /// The data is collected per VF.
1411   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1412 
1413   /// Holds the instructions known to be scalar after vectorization.
1414   /// The data is collected per VF.
1415   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1416 
1417   /// Holds the instructions (address computations) that are forced to be
1418   /// scalarized.
1419   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1420 
1421   /// Returns the expected difference in cost from scalarizing the expression
1422   /// feeding a predicated instruction \p PredInst. The instructions to
1423   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1424   /// non-negative return value implies the expression will be scalarized.
1425   /// Currently, only single-use chains are considered for scalarization.
1426   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1427                               unsigned VF);
1428 
1429   /// Collect the instructions that are uniform after vectorization. An
1430   /// instruction is uniform if we represent it with a single scalar value in
1431   /// the vectorized loop corresponding to each vector iteration. Examples of
1432   /// uniform instructions include pointer operands of consecutive or
1433   /// interleaved memory accesses. Note that although uniformity implies an
1434   /// instruction will be scalar, the reverse is not true. In general, a
1435   /// scalarized instruction will be represented by VF scalar values in the
1436   /// vectorized loop, each corresponding to an iteration of the original
1437   /// scalar loop.
1438   void collectLoopUniforms(unsigned VF);
1439 
1440   /// Collect the instructions that are scalar after vectorization. An
1441   /// instruction is scalar if it is known to be uniform or will be scalarized
1442   /// during vectorization. Non-uniform scalarized instructions will be
1443   /// represented by VF values in the vectorized loop, each corresponding to an
1444   /// iteration of the original scalar loop.
1445   void collectLoopScalars(unsigned VF);
1446 
1447   /// Keeps cost model vectorization decision and cost for instructions.
1448   /// Right now it is used for memory instructions only.
1449   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1450                                 std::pair<InstWidening, unsigned>>;
1451 
1452   DecisionList WideningDecisions;
1453 
1454   /// Returns true if \p V is expected to be vectorized and it needs to be
1455   /// extracted.
1456   bool needsExtract(Value *V, unsigned VF) const {
1457     Instruction *I = dyn_cast<Instruction>(V);
1458     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1459       return false;
1460 
1461     // Assume we can vectorize V (and hence we need extraction) if the
1462     // scalars are not computed yet. This can happen, because it is called
1463     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1464     // the scalars are collected. That should be a safe assumption in most
1465     // cases, because we check if the operands have vectorizable types
1466     // beforehand in LoopVectorizationLegality.
1467     return Scalars.find(VF) == Scalars.end() ||
1468            !isScalarAfterVectorization(I, VF);
1469   };
1470 
1471   /// Returns a range containing only operands needing to be extracted.
1472   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1473                                                    unsigned VF) {
1474     return SmallVector<Value *, 4>(make_filter_range(
1475         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1476   }
1477 
1478 public:
1479   /// The loop that we evaluate.
1480   Loop *TheLoop;
1481 
1482   /// Predicated scalar evolution analysis.
1483   PredicatedScalarEvolution &PSE;
1484 
1485   /// Loop Info analysis.
1486   LoopInfo *LI;
1487 
1488   /// Vectorization legality.
1489   LoopVectorizationLegality *Legal;
1490 
1491   /// Vector target information.
1492   const TargetTransformInfo &TTI;
1493 
1494   /// Target Library Info.
1495   const TargetLibraryInfo *TLI;
1496 
1497   /// Demanded bits analysis.
1498   DemandedBits *DB;
1499 
1500   /// Assumption cache.
1501   AssumptionCache *AC;
1502 
1503   /// Interface to emit optimization remarks.
1504   OptimizationRemarkEmitter *ORE;
1505 
1506   const Function *TheFunction;
1507 
1508   /// Loop Vectorize Hint.
1509   const LoopVectorizeHints *Hints;
1510 
  /// The interleave access information contains groups of interleaved accesses
  /// that have the same stride and are close to each other.
1513   InterleavedAccessInfo &InterleaveInfo;
1514 
1515   /// Values to ignore in the cost model.
1516   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1517 
1518   /// Values to ignore in the cost model when VF > 1.
1519   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1520 };
1521 
1522 } // end namespace llvm
1523 
1524 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1525 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1531 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1532 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1533 // provides *explicit vectorization hints* (LV can bypass legal checks and
1534 // assume that vectorization is legal). However, both hints are implemented
1535 // using the same metadata (llvm.loop.vectorize, processed by
1536 // LoopVectorizeHints). This will be fixed in the future when the native IR
1537 // representation for pragma 'omp simd' is introduced.
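//
// For illustration only (hypothetical source, assuming the usual clang pragma
// spelling): an outer loop annotated as
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       ...
//
// is treated as explicitly vectorized here, whereas the same loop without the
// vectorize_width clause is not.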
1538 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1539                                    OptimizationRemarkEmitter *ORE) {
1540   assert(!OuterLp->empty() && "This is not an outer loop");
1541   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1542 
1543   // Only outer loops with an explicit vectorization hint are supported.
1544   // Unannotated outer loops are ignored.
1545   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1546     return false;
1547 
1548   Function *Fn = OuterLp->getHeader()->getParent();
1549   if (!Hints.allowVectorization(Fn, OuterLp,
1550                                 true /*VectorizeOnlyWhenForced*/)) {
1551     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1552     return false;
1553   }
1554 
1555   if (Hints.getInterleave() > 1) {
1556     // TODO: Interleave support is future work.
1557     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1558                          "outer loops.\n");
1559     Hints.emitRemarkWithHints();
1560     return false;
1561   }
1562 
1563   return true;
1564 }
1565 
1566 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1567                                   OptimizationRemarkEmitter *ORE,
1568                                   SmallVectorImpl<Loop *> &V) {
1569   // Collect inner loops and outer loops without irreducible control flow. For
1570   // now, only collect outer loops that have explicit vectorization hints. If we
1571   // are stress testing the VPlan H-CFG construction, we collect the outermost
1572   // loop of every loop nest.
1573   if (L.empty() || VPlanBuildStressTest ||
1574       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1575     LoopBlocksRPO RPOT(&L);
1576     RPOT.perform(LI);
1577     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1578       V.push_back(&L);
1579       // TODO: Collect inner loops inside marked outer loops in case
1580       // vectorization fails for the outer loop. Do not invoke
1581       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1582       // already known to be reducible. We can use an inherited attribute for
1583       // that.
1584       return;
1585     }
1586   }
1587   for (Loop *InnerL : L)
1588     collectSupportedLoops(*InnerL, LI, ORE, V);
1589 }
1590 
1591 namespace {
1592 
1593 /// The LoopVectorize Pass.
1594 struct LoopVectorize : public FunctionPass {
1595   /// Pass identification, replacement for typeid
1596   static char ID;
1597 
1598   LoopVectorizePass Impl;
1599 
1600   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1601                          bool VectorizeOnlyWhenForced = false)
1602       : FunctionPass(ID) {
1603     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1604     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1605     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1606   }
1607 
1608   bool runOnFunction(Function &F) override {
1609     if (skipFunction(F))
1610       return false;
1611 
1612     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1613     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1614     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1615     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1616     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1617     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1618     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1619     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1620     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1621     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1622     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1623     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1624     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1625 
1626     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1627         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1628 
1629     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1630                         GetLAA, *ORE, PSI);
1631   }
1632 
1633   void getAnalysisUsage(AnalysisUsage &AU) const override {
1634     AU.addRequired<AssumptionCacheTracker>();
1635     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1636     AU.addRequired<DominatorTreeWrapperPass>();
1637     AU.addRequired<LoopInfoWrapperPass>();
1638     AU.addRequired<ScalarEvolutionWrapperPass>();
1639     AU.addRequired<TargetTransformInfoWrapperPass>();
1640     AU.addRequired<AAResultsWrapperPass>();
1641     AU.addRequired<LoopAccessLegacyAnalysis>();
1642     AU.addRequired<DemandedBitsWrapperPass>();
1643     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1644     AU.addRequired<InjectTLIMappingsLegacy>();
1645 
1646     // We currently do not preserve loopinfo/dominator analyses with outer loop
1647     // vectorization. Until this is addressed, mark these analyses as preserved
1648     // only for non-VPlan-native path.
1649     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1650     if (!EnableVPlanNativePath) {
1651       AU.addPreserved<LoopInfoWrapperPass>();
1652       AU.addPreserved<DominatorTreeWrapperPass>();
1653     }
1654 
1655     AU.addPreserved<BasicAAWrapperPass>();
1656     AU.addPreserved<GlobalsAAWrapperPass>();
1657     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1658   }
1659 };
1660 
1661 } // end anonymous namespace
1662 
1663 //===----------------------------------------------------------------------===//
1664 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1665 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1666 //===----------------------------------------------------------------------===//
1667 
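// For illustration only: broadcasting a loop-invariant i32 %x (a hypothetical
// value name) with VF = 4 is expected to produce an insertelement/shufflevector
// splat along the lines of
//
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer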
1668 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1672   Instruction *Instr = dyn_cast<Instruction>(V);
1673   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1674                      (!Instr ||
1675                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1676   // Place the code for broadcasting invariant variables in the new preheader.
1677   IRBuilder<>::InsertPointGuard Guard(Builder);
1678   if (SafeToHoist)
1679     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1680 
1681   // Broadcast the scalar into all locations in the vector.
1682   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1683 
1684   return Shuf;
1685 }
1686 
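// A sketch of the result for an integer IV with start S, step T, VF = 4 and
// UF = 1 (illustrative, not verbatim IR):
//
//   vector.ph:
//     ; SteppedStart = <S, S+T, S+2*T, S+3*T>
//   vector.body:
//     %vec.ind = phi [ SteppedStart, %vector.ph ], [ %vec.ind.next, %latch ]
//     ...
//     %vec.ind.next = add %vec.ind, <4*T, 4*T, 4*T, 4*T> ; the final step.add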
1687 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1688     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1689   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1690          "Expected either an induction phi-node or a truncate of it!");
1691   Value *Start = II.getStartValue();
1692 
1693   // Construct the initial value of the vector IV in the vector loop preheader
1694   auto CurrIP = Builder.saveIP();
1695   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1696   if (isa<TruncInst>(EntryVal)) {
1697     assert(Start->getType()->isIntegerTy() &&
1698            "Truncation requires an integer type");
1699     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1700     Step = Builder.CreateTrunc(Step, TruncType);
1701     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1702   }
1703   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1704   Value *SteppedStart =
1705       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1706 
1707   // We create vector phi nodes for both integer and floating-point induction
1708   // variables. Here, we determine the kind of arithmetic we will perform.
1709   Instruction::BinaryOps AddOp;
1710   Instruction::BinaryOps MulOp;
1711   if (Step->getType()->isIntegerTy()) {
1712     AddOp = Instruction::Add;
1713     MulOp = Instruction::Mul;
1714   } else {
1715     AddOp = II.getInductionOpcode();
1716     MulOp = Instruction::FMul;
1717   }
1718 
1719   // Multiply the vectorization factor by the step using integer or
1720   // floating-point arithmetic as appropriate.
1721   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1722   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1723 
1724   // Create a vector splat to use in the induction update.
1725   //
1726   // FIXME: If the step is non-constant, we create the vector splat with
1727   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1728   //        handle a constant vector splat.
1729   Value *SplatVF = isa<Constant>(Mul)
1730                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1731                        : Builder.CreateVectorSplat(VF, Mul);
1732   Builder.restoreIP(CurrIP);
1733 
1734   // We may need to add the step a number of times, depending on the unroll
1735   // factor. The last of those goes into the PHI.
1736   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1737                                     &*LoopVectorBody->getFirstInsertionPt());
1738   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1739   Instruction *LastInduction = VecInd;
1740   for (unsigned Part = 0; Part < UF; ++Part) {
1741     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1742 
1743     if (isa<TruncInst>(EntryVal))
1744       addMetadata(LastInduction, EntryVal);
1745     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1746 
1747     LastInduction = cast<Instruction>(addFastMathFlag(
1748         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1749     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1750   }
1751 
1752   // Move the last step to the end of the latch block. This ensures consistent
1753   // placement of all induction updates.
1754   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1755   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1756   auto *ICmp = cast<Instruction>(Br->getCondition());
1757   LastInduction->moveBefore(ICmp);
1758   LastInduction->setName("vec.ind.next");
1759 
1760   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1761   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1762 }
1763 
1764 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1765   return Cost->isScalarAfterVectorization(I, VF) ||
1766          Cost->isProfitableToScalarize(I, VF);
1767 }
1768 
1769 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1770   if (shouldScalarizeInstruction(IV))
1771     return true;
1772   auto isScalarInst = [&](User *U) -> bool {
1773     auto *I = cast<Instruction>(U);
1774     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1775   };
1776   return llvm::any_of(IV->users(), isScalarInst);
1777 }
1778 
1779 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1780     const InductionDescriptor &ID, const Instruction *EntryVal,
1781     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1782   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1783          "Expected either an induction phi-node or a truncate of it!");
1784 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // reuses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1791   if (isa<TruncInst>(EntryVal))
1792     return;
1793 
1794   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1795   if (Casts.empty())
1796     return;
1797   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any) have no uses outside the
1799   // induction update chain itself.
1800   Instruction *CastInst = *Casts.begin();
1801   if (Lane < UINT_MAX)
1802     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1803   else
1804     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1805 }
1806 
1807 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1808   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1809          "Primary induction variable must have an integer type");
1810 
1811   auto II = Legal->getInductionVars()->find(IV);
1812   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1813 
1814   auto ID = II->second;
1815   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1816 
1817   // The scalar value to broadcast. This will be derived from the canonical
1818   // induction variable.
1819   Value *ScalarIV = nullptr;
1820 
1821   // The value from the original loop to which we are mapping the new induction
1822   // variable.
1823   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1824 
1825   // True if we have vectorized the induction variable.
1826   auto VectorizedIV = false;
1827 
1828   // Determine if we want a scalar version of the induction variable. This is
1829   // true if the induction variable itself is not widened, or if it has at
1830   // least one user in the loop that is not widened.
1831   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1832 
1833   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1835   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1836          "Induction step should be loop invariant");
1837   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1838   Value *Step = nullptr;
1839   if (PSE.getSE()->isSCEVable(IV->getType())) {
1840     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1841     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1842                              LoopVectorPreHeader->getTerminator());
1843   } else {
1844     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1845   }
1846 
1847   // Try to create a new independent vector induction variable. If we can't
1848   // create the phi node, we will splat the scalar induction variable in each
1849   // loop iteration.
1850   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1851     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1852     VectorizedIV = true;
1853   }
1854 
1855   // If we haven't yet vectorized the induction variable, or if we will create
1856   // a scalar one, we need to define the scalar induction variable and step
1857   // values. If we were given a truncation type, truncate the canonical
1858   // induction variable and step. Otherwise, derive these values from the
1859   // induction descriptor.
1860   if (!VectorizedIV || NeedsScalarIV) {
1861     ScalarIV = Induction;
1862     if (IV != OldInduction) {
1863       ScalarIV = IV->getType()->isIntegerTy()
1864                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1865                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1866                                           IV->getType());
1867       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1868       ScalarIV->setName("offset.idx");
1869     }
1870     if (Trunc) {
1871       auto *TruncType = cast<IntegerType>(Trunc->getType());
1872       assert(Step->getType()->isIntegerTy() &&
1873              "Truncation requires an integer step");
1874       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1875       Step = Builder.CreateTrunc(Step, TruncType);
1876     }
1877   }
1878 
1879   // If we haven't yet vectorized the induction variable, splat the scalar
1880   // induction variable, and build the necessary step vectors.
1881   // TODO: Don't do it unless the vectorized IV is really required.
1882   if (!VectorizedIV) {
1883     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1884     for (unsigned Part = 0; Part < UF; ++Part) {
1885       Value *EntryPart =
1886           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1887       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1888       if (Trunc)
1889         addMetadata(EntryPart, Trunc);
1890       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1891     }
1892   }
1893 
1894   // If an induction variable is only used for counting loop iterations or
1895   // calculating addresses, it doesn't need to be widened. Create scalar steps
1896   // that can be used by instructions we will later scalarize. Note that the
1897   // addition of the scalar steps will not increase the number of instructions
1898   // in the loop in the common case prior to InstCombine. We will be trading
1899   // one vector extract for each scalar step.
1900   if (NeedsScalarIV)
1901     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1902 }
1903 
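// For illustration only: with Val = <x, x, x, x> (VF = 4), StartIdx = 0 and an
// integer step s, the result of getStepVector is
//
//   <x, x, x, x> + <0, 1, 2, 3> * <s, s, s, s>  ==  <x, x+s, x+2*s, x+3*s>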
1904 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1905                                           Instruction::BinaryOps BinOp) {
1906   // Create and check the types.
1907   assert(Val->getType()->isVectorTy() && "Must be a vector");
1908   int VLen = Val->getType()->getVectorNumElements();
1909 
1910   Type *STy = Val->getType()->getScalarType();
1911   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1912          "Induction Step must be an integer or FP");
1913   assert(Step->getType() == STy && "Step has wrong type");
1914 
1915   SmallVector<Constant *, 8> Indices;
1916 
1917   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
1919     for (int i = 0; i < VLen; ++i)
1920       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1921 
1922     // Add the consecutive indices to the vector value.
1923     Constant *Cv = ConstantVector::get(Indices);
1924     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1925     Step = Builder.CreateVectorSplat(VLen, Step);
1926     assert(Step->getType() == Val->getType() && "Invalid step vec");
1927     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1928     // which can be found from the original scalar operations.
1929     Step = Builder.CreateMul(Cv, Step);
1930     return Builder.CreateAdd(Val, Step, "induction");
1931   }
1932 
1933   // Floating point induction.
1934   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1935          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
1937   for (int i = 0; i < VLen; ++i)
1938     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1939 
1940   // Add the consecutive indices to the vector value.
1941   Constant *Cv = ConstantVector::get(Indices);
1942 
1943   Step = Builder.CreateVectorSplat(VLen, Step);
1944 
1945   // Floating point operations had to be 'fast' to enable the induction.
1946   FastMathFlags Flags;
1947   Flags.setFast();
1948 
1949   Value *MulOp = Builder.CreateFMul(Cv, Step);
1950   if (isa<Instruction>(MulOp))
    // We have to check because MulOp may have been folded to a constant.
1952     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1953 
1954   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1955   if (isa<Instruction>(BOp))
1956     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1957   return BOp;
1958 }
1959 
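// A sketch of the scalar steps produced below for ScalarIV = i, Step = s,
// VF = 4 and UF = 2 (illustrative): lane L of part P receives
// i + (P * VF + L) * s, i.e.
//
//   Part 0:  i + 0*s,  i + 1*s,  i + 2*s,  i + 3*s
//   Part 1:  i + 4*s,  i + 5*s,  i + 6*s,  i + 7*s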
1960 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1961                                            Instruction *EntryVal,
1962                                            const InductionDescriptor &ID) {
1963   // We shouldn't have to build scalar steps if we aren't vectorizing.
1964   assert(VF > 1 && "VF should be greater than one");
1965 
  // Get the value type and ensure it and the step have the same type.
1967   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1968   assert(ScalarIVTy == Step->getType() &&
1969          "Val and Step should have the same type");
1970 
1971   // We build scalar steps for both integer and floating-point induction
1972   // variables. Here, we determine the kind of arithmetic we will perform.
1973   Instruction::BinaryOps AddOp;
1974   Instruction::BinaryOps MulOp;
1975   if (ScalarIVTy->isIntegerTy()) {
1976     AddOp = Instruction::Add;
1977     MulOp = Instruction::Mul;
1978   } else {
1979     AddOp = ID.getInductionOpcode();
1980     MulOp = Instruction::FMul;
1981   }
1982 
1983   // Determine the number of scalars we need to generate for each unroll
1984   // iteration. If EntryVal is uniform, we only need to generate the first
1985   // lane. Otherwise, we generate all VF values.
1986   unsigned Lanes =
1987       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1988                                                                          : VF;
1989   // Compute the scalar steps and save the results in VectorLoopValueMap.
1990   for (unsigned Part = 0; Part < UF; ++Part) {
1991     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1992       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1993       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1994       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1995       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1996       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1997     }
1998   }
1999 }
2000 
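// For illustration only (the value names below are made up): if %v was
// scalarized with VF = 4 and is not uniform, its vector form for a given part
// is reconstructed on demand by packing the scalar lanes with insertelement:
//
//   %pack.0 = insertelement <4 x i32> undef, i32 %v.lane0, i32 0
//   %pack.1 = insertelement <4 x i32> %pack.0, i32 %v.lane1, i32 1
//   ...                                        ; up to lane VF - 1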
2001 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2002   assert(V != Induction && "The new induction variable should not be used.");
2003   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2004   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2005 
2006   // If we have a stride that is replaced by one, do it here. Defer this for
2007   // the VPlan-native path until we start running Legal checks in that path.
2008   if (!EnableVPlanNativePath && Legal->hasStride(V))
2009     V = ConstantInt::get(V->getType(), 1);
2010 
2011   // If we have a vector mapped to this value, return it.
2012   if (VectorLoopValueMap.hasVectorValue(V, Part))
2013     return VectorLoopValueMap.getVectorValue(V, Part);
2014 
2015   // If the value has not been vectorized, check if it has been scalarized
2016   // instead. If it has been scalarized, and we actually need the value in
2017   // vector form, we will construct the vector values on demand.
2018   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2019     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2020 
2021     // If we've scalarized a value, that value should be an instruction.
2022     auto *I = cast<Instruction>(V);
2023 
2024     // If we aren't vectorizing, we can just copy the scalar map values over to
2025     // the vector map.
2026     if (VF == 1) {
2027       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2028       return ScalarValue;
2029     }
2030 
2031     // Get the last scalar instruction we generated for V and Part. If the value
2032     // is known to be uniform after vectorization, this corresponds to lane zero
2033     // of the Part unroll iteration. Otherwise, the last instruction is the one
2034     // we created for the last vector lane of the Part unroll iteration.
2035     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2036     auto *LastInst = cast<Instruction>(
2037         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2038 
2039     // Set the insert point after the last scalarized instruction. This ensures
2040     // the insertelement sequence will directly follow the scalar definitions.
2041     auto OldIP = Builder.saveIP();
2042     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2043     Builder.SetInsertPoint(&*NewIP);
2044 
2045     // However, if we are vectorizing, we need to construct the vector values.
2046     // If the value is known to be uniform after vectorization, we can just
2047     // broadcast the scalar value corresponding to lane zero for each unroll
2048     // iteration. Otherwise, we construct the vector values using insertelement
2049     // instructions. Since the resulting vectors are stored in
2050     // VectorLoopValueMap, we will only generate the insertelements once.
2051     Value *VectorValue = nullptr;
2052     if (Cost->isUniformAfterVectorization(I, VF)) {
2053       VectorValue = getBroadcastInstrs(ScalarValue);
2054       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2055     } else {
2056       // Initialize packing with insertelements to start from undef.
2057       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2058       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2059       for (unsigned Lane = 0; Lane < VF; ++Lane)
2060         packScalarIntoVectorValue(V, {Part, Lane});
2061       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2062     }
2063     Builder.restoreIP(OldIP);
2064     return VectorValue;
2065   }
2066 
2067   // If this scalar is unknown, assume that it is a constant or that it is
2068   // loop invariant. Broadcast V and save the value for future uses.
2069   Value *B = getBroadcastInstrs(V);
2070   VectorLoopValueMap.setVectorValue(V, Part, B);
2071   return B;
2072 }
2073 
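// For illustration only (the value name below is made up): if %v was widened
// with VF = 4, the scalar value for lane 2 of part P is obtained with
//
//   %lane = extractelement <4 x i32> %v.widened.P, i32 2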
2074 Value *
2075 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2076                                             const VPIteration &Instance) {
2077   // If the value is not an instruction contained in the loop, it should
2078   // already be scalar.
2079   if (OrigLoop->isLoopInvariant(V))
2080     return V;
2081 
2082   assert(Instance.Lane > 0
2083              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2084              : true && "Uniform values only have lane zero");
2085 
2086   // If the value from the original loop has not been vectorized, it is
2087   // represented by UF x VF scalar values in the new loop. Return the requested
2088   // scalar value.
2089   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2090     return VectorLoopValueMap.getScalarValue(V, Instance);
2091 
2092   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2093   // for the given unroll part. If this entry is not a vector type (i.e., the
2094   // vectorization factor is one), there is no need to generate an
2095   // extractelement instruction.
2096   auto *U = getOrCreateVectorValue(V, Instance.Part);
2097   if (!U->getType()->isVectorTy()) {
2098     assert(VF == 1 && "Value not scalarized has non-vector type");
2099     return U;
2100   }
2101 
2102   // Otherwise, the value from the original loop has been vectorized and is
2103   // represented by UF vector values. Extract and return the requested scalar
2104   // value from the appropriate vector lane.
2105   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2106 }
2107 
2108 void InnerLoopVectorizer::packScalarIntoVectorValue(
2109     Value *V, const VPIteration &Instance) {
2110   assert(V != Induction && "The new induction variable should not be used.");
2111   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2112   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2113 
2114   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2115   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2116   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2117                                             Builder.getInt32(Instance.Lane));
2118   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2119 }
2120 
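// For illustration only: with VF = 4 the reverse shuffle mask is <3, 2, 1, 0>,
// so <a, b, c, d> becomes <d, c, b, a>.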
2121 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2122   assert(Vec->getType()->isVectorTy() && "Invalid type");
2123   SmallVector<Constant *, 8> ShuffleMask;
2124   for (unsigned i = 0; i < VF; ++i)
2125     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2126 
2127   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2128                                      ConstantVector::get(ShuffleMask),
2129                                      "reverse");
2130 }
2131 
2132 // Return whether we allow using masked interleave-groups (for dealing with
2133 // strided loads/stores that reside in predicated blocks, or for dealing
2134 // with gaps).
2135 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2136   // If an override option has been passed in for interleaved accesses, use it.
2137   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2138     return EnableMaskedInterleavedMemAccesses;
2139 
2140   return TTI.enableMaskedInterleavedAccessVectorization();
2141 }
2142 
2143 // Try to vectorize the interleave group that \p Instr belongs to.
2144 //
2145 // E.g. Translate following interleaved load group (factor = 3):
2146 //   for (i = 0; i < N; i+=3) {
2147 //     R = Pic[i];             // Member of index 0
2148 //     G = Pic[i+1];           // Member of index 1
2149 //     B = Pic[i+2];           // Member of index 2
2150 //     ... // do something to R, G, B
2151 //   }
2152 // To:
2153 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2154 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2155 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2156 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2157 //
2158 // Or translate following interleaved store group (factor = 3):
2159 //   for (i = 0; i < N; i+=3) {
2160 //     ... do something to R, G, B
2161 //     Pic[i]   = R;           // Member of index 0
2162 //     Pic[i+1] = G;           // Member of index 1
2163 //     Pic[i+2] = B;           // Member of index 2
2164 //   }
2165 // To:
2166 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2167 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2168 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2169 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2170 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2171 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2172                                                    VPTransformState &State,
2173                                                    VPValue *Addr,
2174                                                    VPValue *BlockInMask) {
2175   const InterleaveGroup<Instruction> *Group =
2176       Cost->getInterleavedAccessGroup(Instr);
2177   assert(Group && "Fail to get an interleaved access group.");
2178 
  // Skip if the current instruction is not the insert position.
2180   if (Instr != Group->getInsertPos())
2181     return;
2182 
2183   const DataLayout &DL = Instr->getModule()->getDataLayout();
2184 
  // Prepare the vector type of the interleaved load/store.
2186   Type *ScalarTy = getMemInstValueType(Instr);
2187   unsigned InterleaveFactor = Group->getFactor();
2188   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2189 
  // Prepare the new pointers.
2191   SmallVector<Value *, 2> AddrParts;
2192   unsigned Index = Group->getIndex(Instr);
2193 
2194   // TODO: extend the masked interleaved-group support to reversed access.
2195   assert((!BlockInMask || !Group->isReverse()) &&
2196          "Reversed masked interleave-group not supported.");
2197 
2198   // If the group is reverse, adjust the index to refer to the last vector lane
2199   // instead of the first. We adjust the index from the first vector lane,
2200   // rather than directly getting the pointer for lane VF - 1, because the
2201   // pointer operand of the interleaved access is supposed to be uniform. For
2202   // uniform instructions, we're only required to generate a value for the
2203   // first vector lane in each unroll iteration.
2204   if (Group->isReverse())
2205     Index += (VF - 1) * Group->getFactor();
2206 
2207   for (unsigned Part = 0; Part < UF; Part++) {
2208     Value *AddrPart = State.get(Addr, {Part, 0});
2209     setDebugLocFromInst(Builder, AddrPart);
2210 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2222 
2223     bool InBounds = false;
2224     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2225       InBounds = gep->isInBounds();
2226     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2227     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2228 
2229     // Cast to the vector pointer type.
2230     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2231     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2232     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2233   }
2234 
2235   setDebugLocFromInst(Builder, Instr);
2236   Value *UndefVec = UndefValue::get(VecTy);
2237 
2238   Value *MaskForGaps = nullptr;
2239   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2240     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2241     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2242   }
2243 
2244   // Vectorize the interleaved load group.
2245   if (isa<LoadInst>(Instr)) {
2246     // For each unroll part, create a wide load for the group.
2247     SmallVector<Value *, 2> NewLoads;
2248     for (unsigned Part = 0; Part < UF; Part++) {
2249       Instruction *NewLoad;
2250       if (BlockInMask || MaskForGaps) {
2251         assert(useMaskedInterleavedAccesses(*TTI) &&
2252                "masked interleaved groups are not allowed.");
2253         Value *GroupMask = MaskForGaps;
2254         if (BlockInMask) {
2255           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2256           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2257           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2258           Value *ShuffledMask = Builder.CreateShuffleVector(
2259               BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2260           GroupMask = MaskForGaps
2261                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2262                                                 MaskForGaps)
2263                           : ShuffledMask;
2264         }
2265         NewLoad =
2266             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(),
2267                                      GroupMask, UndefVec, "wide.masked.vec");
2268       }
2269       else
2270         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2271                                             Group->getAlignment(), "wide.vec");
2272       Group->addMetadata(NewLoad);
2273       NewLoads.push_back(NewLoad);
2274     }
2275 
2276     // For each member in the group, shuffle out the appropriate data from the
2277     // wide loads.
2278     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2279       Instruction *Member = Group->getMember(I);
2280 
2281       // Skip the gaps in the group.
2282       if (!Member)
2283         continue;
2284 
2285       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2286       for (unsigned Part = 0; Part < UF; Part++) {
2287         Value *StridedVec = Builder.CreateShuffleVector(
2288             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2289 
        // If this member has a different type, cast the result to that type.
2291         if (Member->getType() != ScalarTy) {
2292           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2293           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2294         }
2295 
2296         if (Group->isReverse())
2297           StridedVec = reverseVector(StridedVec);
2298 
2299         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2300       }
2301     }
2302     return;
2303   }
2304 
  // The sub-vector type for the current instruction.
2306   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2307 
2308   // Vectorize the interleaved store group.
2309   for (unsigned Part = 0; Part < UF; Part++) {
2310     // Collect the stored vector from each member.
2311     SmallVector<Value *, 4> StoredVecs;
2312     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
2314       Instruction *Member = Group->getMember(i);
2315       assert(Member && "Fail to get a member from an interleaved store group");
2316 
2317       Value *StoredVec = getOrCreateVectorValue(
2318           cast<StoreInst>(Member)->getValueOperand(), Part);
2319       if (Group->isReverse())
2320         StoredVec = reverseVector(StoredVec);
2321 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2325         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2326 
2327       StoredVecs.push_back(StoredVec);
2328     }
2329 
2330     // Concatenate all vectors into a wide vector.
2331     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2332 
2333     // Interleave the elements in the wide vector.
2334     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2335     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2336                                               "interleaved.vec");
2337 
2338     Instruction *NewStoreInstr;
2339     if (BlockInMask) {
2340       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2341       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2342       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2343       Value *ShuffledMask = Builder.CreateShuffleVector(
2344           BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2345       NewStoreInstr = Builder.CreateMaskedStore(
2346           IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask);
2347     }
2348     else
2349       NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part],
2350                                                  Group->getAlignment());
2351 
2352     Group->addMetadata(NewStoreInstr);
2353   }
2354 }
2355 
2356 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2357                                                      VPTransformState &State,
2358                                                      VPValue *Addr,
2359                                                      VPValue *BlockInMask) {
  // Attempt to issue a wide load or store.
2361   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2362   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2363 
2364   assert((LI || SI) && "Invalid Load/Store instruction");
2365 
2366   LoopVectorizationCostModel::InstWidening Decision =
2367       Cost->getWideningDecision(Instr, VF);
2368   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2369          "CM decision should be taken at this point");
2370   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2371     return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2372 
2373   Type *ScalarDataTy = getMemInstValueType(Instr);
2374   Type *DataTy = VectorType::get(ScalarDataTy, VF);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2377   const DataLayout &DL = Instr->getModule()->getDataLayout();
2378   const Align Alignment =
2379       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2380 
2381   // Determine if the pointer operand of the access is either consecutive or
2382   // reverse consecutive.
2383   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2384   bool ConsecutiveStride =
2385       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2386   bool CreateGatherScatter =
2387       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2388 
2389   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2390   // gather/scatter. Otherwise Decision should have been to Scalarize.
2391   assert((ConsecutiveStride || CreateGatherScatter) &&
2392          "The instruction should be scalarized");
2393   (void)ConsecutiveStride;
2394 
2395   VectorParts BlockInMaskParts(UF);
2396   bool isMaskRequired = BlockInMask;
2397   if (isMaskRequired)
2398     for (unsigned Part = 0; Part < UF; ++Part)
2399       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2400 
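  // A sketch of the per-part pointers computed by CreateVecPtr below
  // (illustrative), for VF = 4, UF = 2 and a consecutive access at base
  // pointer %p:
  //
  //   forward:  Part 0 -> gep %p, 0            Part 1 -> gep %p, 4
  //   reverse:  Part 0 -> gep (gep %p, 0), -3    (covers lanes p-3 .. p)
  //             Part 1 -> gep (gep %p, -4), -3   (covers lanes p-7 .. p-4)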
2401   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2402     // Calculate the pointer for the specific unroll-part.
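    // E.g., with VF = 4, Part 1 of a consecutive access starts at Ptr + 4 (in
    // elements); a reverse access starts at Ptr - 4 + (1 - 4) = Ptr - 7, the
    // lowest address covered by that (reversed) part.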
2403     GetElementPtrInst *PartPtr = nullptr;
2404 
2405     bool InBounds = false;
2406     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2407       InBounds = gep->isInBounds();
2408 
2409     if (Reverse) {
2410       // If the address is consecutive but reversed, then the
2411       // wide store needs to start at the last vector element.
2412       PartPtr = cast<GetElementPtrInst>(
2413           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2414       PartPtr->setIsInBounds(InBounds);
2415       PartPtr = cast<GetElementPtrInst>(
2416           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2417       PartPtr->setIsInBounds(InBounds);
2418       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2419         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2420     } else {
2421       PartPtr = cast<GetElementPtrInst>(
2422           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2423       PartPtr->setIsInBounds(InBounds);
2424     }
2425 
2426     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2427     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2428   };
2429 
2430   // Handle Stores:
2431   if (SI) {
2432     setDebugLocFromInst(Builder, SI);
2433 
2434     for (unsigned Part = 0; Part < UF; ++Part) {
2435       Instruction *NewSI = nullptr;
2436       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2437       if (CreateGatherScatter) {
2438         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2439         Value *VectorGep = State.get(Addr, Part);
2440         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2441                                             Alignment.value(), MaskPart);
2442       } else {
2443         if (Reverse) {
2444           // If we store to reverse consecutive memory locations, then we need
2445           // to reverse the order of elements in the stored value.
2446           StoredVal = reverseVector(StoredVal);
2447           // We don't want to update the value in the map as it might be used in
2448           // another expression. So don't call resetVectorValue(StoredVal).
2449         }
2450         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2451         if (isMaskRequired)
2452           NewSI = Builder.CreateMaskedStore(
2453               StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]);
2454         else
2455           NewSI =
2456               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2457       }
2458       addMetadata(NewSI, SI);
2459     }
2460     return;
2461   }
2462 
2463   // Handle loads.
2464   assert(LI && "Must have a load instruction");
2465   setDebugLocFromInst(Builder, LI);
2466   for (unsigned Part = 0; Part < UF; ++Part) {
2467     Value *NewLI;
2468     if (CreateGatherScatter) {
2469       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2470       Value *VectorGep = State.get(Addr, Part);
2471       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2472                                          nullptr, "wide.masked.gather");
2473       addMetadata(NewLI, LI);
2474     } else {
2475       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2476       if (isMaskRequired)
2477         NewLI = Builder.CreateMaskedLoad(
2478             VecPtr, Alignment.value(), BlockInMaskParts[Part],
2479             UndefValue::get(DataTy), "wide.masked.load");
2480       else
2481         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2482                                           "wide.load");
2483 
2484       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2485       addMetadata(NewLI, LI);
2486       if (Reverse)
2487         NewLI = reverseVector(NewLI);
2488     }
2489     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2490   }
2491 }
2492 
2493 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2494                                                const VPIteration &Instance,
2495                                                bool IfPredicateInstr) {
2496   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2497 
2498   setDebugLocFromInst(Builder, Instr);
2499 
  // Does this instruction return a value?
2501   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2502 
2503   Instruction *Cloned = Instr->clone();
2504   if (!IsVoidRetTy)
2505     Cloned->setName(Instr->getName() + ".cloned");
2506 
2507   // Replace the operands of the cloned instructions with their scalar
2508   // equivalents in the new loop.
2509   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2510     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2511     Cloned->setOperand(op, NewOp);
2512   }
2513   addNewMetadata(Cloned, Instr);
2514 
2515   // Place the cloned scalar in the new loop.
2516   Builder.Insert(Cloned);
2517 
2518   // Add the cloned scalar to the scalar map entry.
2519   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2520 
2521   // If we just cloned a new assumption, add it the assumption cache.
2522   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2523     if (II->getIntrinsicID() == Intrinsic::assume)
2524       AC->registerAssumption(II);
2525 
2526   // End if-block.
2527   if (IfPredicateInstr)
2528     PredicatedInstructions.push_back(Cloned);
2529 }
2530 
2531 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2532                                                       Value *End, Value *Step,
2533                                                       Instruction *DL) {
2534   BasicBlock *Header = L->getHeader();
2535   BasicBlock *Latch = L->getLoopLatch();
2536   // As we're just creating this loop, it's possible no latch exists
2537   // yet. If so, use the header as this will be a single block loop.
2538   if (!Latch)
2539     Latch = Header;
2540 
2541   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2542   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2543   setDebugLocFromInst(Builder, OldInst);
2544   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2545 
2546   Builder.SetInsertPoint(Latch->getTerminator());
2547   setDebugLocFromInst(Builder, OldInst);
2548 
2549   // Create i+1 and fill the PHINode.
2550   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2551   Induction->addIncoming(Start, L->getLoopPreheader());
2552   Induction->addIncoming(Next, Latch);
2553   // Create the compare.
2554   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2555   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2556 
2557   // Now we have two terminators. Remove the old one from the block.
2558   Latch->getTerminator()->eraseFromParent();
2559 
2560   return Induction;
2561 }
2562 
2563 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2564   if (TripCount)
2565     return TripCount;
2566 
2567   assert(L && "Create Trip Count for null loop.");
2568   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2569   // Find the loop boundaries.
2570   ScalarEvolution *SE = PSE.getSE();
2571   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2572   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2573          "Invalid loop count");
2574 
2575   Type *IdxTy = Legal->getWidestInductionType();
2576   assert(IdxTy && "No type for induction");
2577 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count in that case is if
  // the induction variable was signed, and as such it will not overflow. In
  // such a case truncation is legal.
2583   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2584       IdxTy->getPrimitiveSizeInBits())
2585     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2586   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2587 
2588   // Get the total trip count from the count by adding 1.
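  // E.g., a loop 'for (i = 0; i < n; ++i)' with n > 0 has a backedge-taken
  // count of n - 1 and a trip count of n.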
2589   const SCEV *ExitCount = SE->getAddExpr(
2590       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2591 
2592   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2593 
2594   // Expand the trip count and place the new instructions in the preheader.
2595   // Notice that the pre-header does not change, only the loop body.
2596   SCEVExpander Exp(*SE, DL, "induction");
2597 
2598   // Count holds the overall loop count (N).
2599   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2600                                 L->getLoopPreheader()->getTerminator());
2601 
2602   if (TripCount->getType()->isPointerTy())
2603     TripCount =
2604         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2605                                     L->getLoopPreheader()->getTerminator());
2606 
2607   return TripCount;
2608 }
2609 
2610 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2611   if (VectorTripCount)
2612     return VectorTripCount;
2613 
2614   Value *TC = getOrCreateTripCount(L);
2615   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2616 
2617   Type *Ty = TC->getType();
2618   Constant *Step = ConstantInt::get(Ty, VF * UF);
2619 
2620   // If the tail is to be folded by masking, round the number of iterations N
2621   // up to a multiple of Step instead of rounding down. This is done by first
2622   // adding Step-1 and then rounding down. Note that it's ok if this addition
2623   // overflows: the vector induction variable will eventually wrap to zero given
2624   // that it starts at zero and its Step is a power of two; the loop will then
2625   // exit, with the last early-exit vector comparison also producing all-true.
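  // E.g., with VF * UF = 8 and N = 13, N is rounded up to 13 + 7 = 20, the
  // vector trip count computed below becomes 20 - (20 % 8) = 16, and two
  // masked vector iterations cover all 13 scalar iterations.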
2626   if (Cost->foldTailByMasking()) {
2627     assert(isPowerOf2_32(VF * UF) &&
2628            "VF*UF must be a power of 2 when folding tail by masking");
2629     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2630   }
2631 
2632   // Now we need to generate the expression for the part of the loop that the
2633   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2634   // iterations are not required for correctness, or N - Step, otherwise. Step
2635   // is equal to the vectorization factor (number of SIMD elements) times the
2636   // unroll factor (number of SIMD instructions).
2637   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2638 
2639   // If there is a non-reversed interleaved group that may speculatively access
2640   // memory out-of-bounds, we need to ensure that there will be at least one
2641   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2642   // the trip count, we set the remainder to be equal to the step. If the step
2643   // does not evenly divide the trip count, no adjustment is necessary since
2644   // there will already be scalar iterations. Note that the minimum iterations
2645   // check ensures that N >= Step.
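  // E.g., with Step = 8 and a trip count of 16, R would be 0; it is bumped to
  // 8 so that the vector loop covers only the first 8 iterations and the
  // scalar epilogue executes the remaining 8.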
2646   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2647     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2648     R = Builder.CreateSelect(IsZero, Step, R);
2649   }
2650 
2651   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2652 
2653   return VectorTripCount;
2654 }
2655 
2656 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2657                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
  unsigned VF = DstVTy->getNumElements();
  VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2662   Type *SrcElemTy = SrcVecTy->getElementType();
2663   Type *DstElemTy = DstVTy->getElementType();
2664   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2665          "Vector elements must have same size");
2666 
2667   // Do a direct cast if element types are castable.
2668   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2669     return Builder.CreateBitOrPointerCast(V, DstVTy);
2670   }
  // V cannot be directly cast to the desired vector type. This may happen when
  // V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this using a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
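  // E.g., on a target with 64-bit pointers, a <2 x double> value is cast to
  // <2 x i8*> through an intermediate <2 x i64> vector.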
2675   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2676          "Only one type should be a pointer type");
2677   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2678          "Only one type should be a floating point type");
2679   Type *IntTy =
2680       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2681   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2682   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2683   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2684 }
2685 
2686 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2687                                                          BasicBlock *Bypass) {
2688   Value *Count = getOrCreateTripCount(L);
  // Reuse the existing vector loop preheader for the TC checks.
  // Note that a new preheader block is generated for the vector loop.
2691   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2692   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2693 
2694   // Generate code to check if the loop's trip count is less than VF * UF, or
2695   // equal to it in case a scalar epilogue is required; this implies that the
2696   // vector trip count is zero. This check also covers the case where adding one
2697   // to the backedge-taken count overflowed leading to an incorrect trip count
2698   // of zero. In this case we will also jump to the scalar loop.
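  // E.g., with VF = 4 and UF = 2, the vector loop is bypassed whenever the
  // trip count is below 8 (or also when it equals 8, if a scalar epilogue is
  // required).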
2699   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2700                                           : ICmpInst::ICMP_ULT;
2701 
  // If the tail is to be folded, the vector loop takes care of all iterations.
2703   Value *CheckMinIters = Builder.getFalse();
2704   if (!Cost->foldTailByMasking())
2705     CheckMinIters = Builder.CreateICmp(
2706         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2707         "min.iters.check");
2708 
2709   // Create new preheader for vector loop.
2710   LoopVectorPreHeader =
2711       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2712                  "vector.ph");
2713 
2714   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2715                                DT->getNode(Bypass)->getIDom()) &&
2716          "TC check is expected to dominate Bypass");
2717 
2718   // Update dominator for Bypass & LoopExit.
2719   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2720   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2721 
2722   ReplaceInstWithInst(
2723       TCCheckBlock->getTerminator(),
2724       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2725   LoopBypassBlocks.push_back(TCCheckBlock);
2726 }
2727 
2728 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  // Reuse the existing vector loop preheader for the SCEV checks.
  // Note that a new preheader block is generated for the vector loop.
2731   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2732 
  // Generate the code to check the SCEV assumptions that we made.
2734   // We want the new basic block to start at the first instruction in a
2735   // sequence of instructions that form a check.
2736   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2737                    "scev.check");
2738   Value *SCEVCheck = Exp.expandCodeForPredicate(
2739       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2740 
2741   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2742     if (C->isZero())
2743       return;
2744 
2745   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2746          "Cannot SCEV check stride or overflow when optimizing for size");
2747 
2748   SCEVCheckBlock->setName("vector.scevcheck");
2749   // Create new preheader for vector loop.
2750   LoopVectorPreHeader =
2751       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2752                  nullptr, "vector.ph");
2753 
  // Update the dominator only if this is the first RT check.
2755   if (LoopBypassBlocks.empty()) {
2756     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2757     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2758   }
2759 
2760   ReplaceInstWithInst(
2761       SCEVCheckBlock->getTerminator(),
2762       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2763   LoopBypassBlocks.push_back(SCEVCheckBlock);
2764   AddedSafetyChecks = true;
2765 }
2766 
2767 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2768   // VPlan-native path does not do any analysis for runtime checks currently.
2769   if (EnableVPlanNativePath)
2770     return;
2771 
  // Reuse the existing vector loop preheader for the runtime memory checks.
  // Note that a new preheader block is generated for the vector loop.
2774   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2775 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2779   Instruction *FirstCheckInst;
2780   Instruction *MemRuntimeCheck;
2781   std::tie(FirstCheckInst, MemRuntimeCheck) =
2782       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2783   if (!MemRuntimeCheck)
2784     return;
2785 
2786   if (MemCheckBlock->getParent()->hasOptSize()) {
2787     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2788            "Cannot emit memory checks when optimizing for size, unless forced "
2789            "to vectorize.");
2790     ORE->emit([&]() {
2791       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2792                                         L->getStartLoc(), L->getHeader())
2793              << "Code-size may be reduced by not forcing "
2794                 "vectorization, or by source-code modifications "
2795                 "eliminating the need for runtime checks "
2796                 "(e.g., adding 'restrict').";
2797     });
2798   }
2799 
2800   MemCheckBlock->setName("vector.memcheck");
2801   // Create new preheader for vector loop.
2802   LoopVectorPreHeader =
2803       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2804                  "vector.ph");
2805 
  // Update the dominator only if this is the first RT check.
2807   if (LoopBypassBlocks.empty()) {
2808     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2809     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2810   }
2811 
2812   ReplaceInstWithInst(
2813       MemCheckBlock->getTerminator(),
2814       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2815   LoopBypassBlocks.push_back(MemCheckBlock);
2816   AddedSafetyChecks = true;
2817 
2818   // We currently don't use LoopVersioning for the actual loop cloning but we
2819   // still use it to add the noalias metadata.
2820   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2821                                           PSE.getSE());
2822   LVer->prepareNoAliasMetadata();
2823 }
2824 
2825 Value *InnerLoopVectorizer::emitTransformedIndex(
2826     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2827     const InductionDescriptor &ID) const {
2828 
2829   SCEVExpander Exp(*SE, DL, "induction");
2830   auto Step = ID.getStep();
2831   auto StartValue = ID.getStartValue();
2832   assert(Index->getType() == Step->getType() &&
2833          "Index type does not match StepValue type");
2834 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
2841   auto CreateAdd = [&B](Value *X, Value *Y) {
2842     assert(X->getType() == Y->getType() && "Types don't match!");
2843     if (auto *CX = dyn_cast<ConstantInt>(X))
2844       if (CX->isZero())
2845         return Y;
2846     if (auto *CY = dyn_cast<ConstantInt>(Y))
2847       if (CY->isZero())
2848         return X;
2849     return B.CreateAdd(X, Y);
2850   };
2851 
2852   auto CreateMul = [&B](Value *X, Value *Y) {
2853     assert(X->getType() == Y->getType() && "Types don't match!");
2854     if (auto *CX = dyn_cast<ConstantInt>(X))
2855       if (CX->isOne())
2856         return Y;
2857     if (auto *CY = dyn_cast<ConstantInt>(Y))
2858       if (CY->isOne())
2859         return X;
2860     return B.CreateMul(X, Y);
2861   };
2862 
2863   switch (ID.getKind()) {
2864   case InductionDescriptor::IK_IntInduction: {
2865     assert(Index->getType() == StartValue->getType() &&
2866            "Index type does not match StartValue type");
2867     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2868       return B.CreateSub(StartValue, Index);
2869     auto *Offset = CreateMul(
2870         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2871     return CreateAdd(StartValue, Offset);
2872   }
2873   case InductionDescriptor::IK_PtrInduction: {
2874     assert(isa<SCEVConstant>(Step) &&
2875            "Expected constant step for pointer induction");
2876     return B.CreateGEP(
2877         StartValue->getType()->getPointerElementType(), StartValue,
2878         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2879                                            &*B.GetInsertPoint())));
2880   }
2881   case InductionDescriptor::IK_FpInduction: {
2882     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2883     auto InductionBinOp = ID.getInductionBinOp();
2884     assert(InductionBinOp &&
2885            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2886             InductionBinOp->getOpcode() == Instruction::FSub) &&
2887            "Original bin op should be defined for FP induction");
2888 
2889     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2890 
2891     // Floating point operations had to be 'fast' to enable the induction.
2892     FastMathFlags Flags;
2893     Flags.setFast();
2894 
2895     Value *MulExp = B.CreateFMul(StepValue, Index);
    if (isa<Instruction>(MulExp))
      // We have to check because MulExp may have been folded to a constant.
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2899 
2900     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2901                                "induction");
2902     if (isa<Instruction>(BOp))
2903       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2904 
2905     return BOp;
2906   }
2907   case InductionDescriptor::IK_NoInduction:
2908     return nullptr;
2909   }
2910   llvm_unreachable("invalid enum");
2911 }
2912 
2913 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2914   /*
2915    In this function we generate a new loop. The new loop will contain
2916    the vectorized instructions while the old loop will continue to run the
2917    scalar remainder.
2918 
2919        [ ] <-- loop iteration number check.
2920     /   |
2921    /    v
2922   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2923   |  /  |
2924   | /   v
2925   ||   [ ]     <-- vector pre header.
2926   |/    |
2927   |     v
2928   |    [  ] \
2929   |    [  ]_|   <-- vector loop.
2930   |     |
2931   |     v
2932   |   -[ ]   <--- middle-block.
2933   |  /  |
2934   | /   v
2935   -|- >[ ]     <--- new preheader.
2936    |    |
2937    |    v
2938    |   [ ] \
2939    |   [ ]_|   <-- old scalar loop to handle remainder.
2940     \   |
2941      \  v
2942       >[ ]     <-- exit block.
2943    ...
2944    */
2945 
2946   MDNode *OrigLoopID = OrigLoop->getLoopID();
2947 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
  // induction variables. In the code below we also support the case where we
  // don't have a single induction variable.
2952   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
2959   OldInduction = Legal->getPrimaryInduction();
2960   Type *IdxTy = Legal->getWidestInductionType();
2961 
2962   // Split the single block loop into the two loop structure described above.
2963   LoopScalarBody = OrigLoop->getHeader();
2964   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2965   LoopExitBlock = OrigLoop->getExitBlock();
2966   assert(LoopExitBlock && "Must have an exit block");
2967   assert(LoopVectorPreHeader && "Invalid loop structure");
2968 
2969   LoopMiddleBlock =
2970       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2971                  LI, nullptr, "middle.block");
2972   LoopScalarPreHeader =
2973       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2974                  nullptr, "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines below.
2978   LoopVectorBody =
2979       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2980                  nullptr, nullptr, "vector.body");
2981 
2982   // Update dominator for loop exit.
2983   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2984 
2985   // Create and register the new vector loop.
2986   Loop *Lp = LI->AllocateLoop();
2987   Loop *ParentLoop = OrigLoop->getParentLoop();
2988 
2989   // Insert the new loop into the loop nest and register the new basic blocks
2990   // before calling any utilities such as SCEV that require valid LoopInfo.
2991   if (ParentLoop) {
2992     ParentLoop->addChildLoop(Lp);
2993   } else {
2994     LI->addTopLevelLoop(Lp);
2995   }
2996   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
2997 
2998   // Find the loop boundaries.
2999   Value *Count = getOrCreateTripCount(Lp);
3000 
3001   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3002 
3003   // Now, compare the new count to zero. If it is zero skip the vector loop and
3004   // jump to the scalar loop. This check also covers the case where the
3005   // backedge-taken count is uint##_max: adding one to it will overflow leading
3006   // to an incorrect trip count of zero. In this (rare) case we will also jump
3007   // to the scalar loop.
3008   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3009 
3010   // Generate the code to check any assumptions that we've made for SCEV
3011   // expressions.
3012   emitSCEVChecks(Lp, LoopScalarPreHeader);
3013 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3017   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3018 
3019   // Generate the induction variable.
3020   // The loop step is equal to the vectorization factor (num of SIMD elements)
3021   // times the unroll factor (num of SIMD instructions).
3022   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3023   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3024   Induction =
3025       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3026                               getDebugLocFromInstOrOperands(OldInduction));
3027 
3028   // We are going to resume the execution of the scalar loop.
3029   // Go over all of the induction variables that we found and fix the
3030   // PHIs that are left in the scalar version of the loop.
3031   // The starting values of PHI nodes depend on the counter of the last
3032   // iteration in the vectorized loop.
3033   // If we come from a bypass edge then we need to start from the original
3034   // start value.
3035 
3036   // This variable saves the new starting index for the scalar loop. It is used
3037   // to test if there are any tail iterations left once the vector loop has
3038   // completed.
3039   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3040   for (auto &InductionEntry : *List) {
3041     PHINode *OrigPhi = InductionEntry.first;
3042     InductionDescriptor II = InductionEntry.second;
3043 
    // Create phi nodes to merge from the backedge-taken check block.
3045     PHINode *BCResumeVal =
3046         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3047                         LoopScalarPreHeader->getTerminator());
3048     // Copy original phi DL over to the new one.
3049     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3050     Value *&EndValue = IVEndValues[OrigPhi];
3051     if (OrigPhi == OldInduction) {
3052       // We know what the end value is.
3053       EndValue = CountRoundDown;
3054     } else {
3055       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3056       Type *StepType = II.getStep()->getType();
3057       Instruction::CastOps CastOp =
3058           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3059       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3060       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3061       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3062       EndValue->setName("ind.end");
3063     }
3064 
3065     // The new PHI merges the original incoming value, in case of a bypass,
3066     // or the value at the end of the vectorized loop.
3067     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3068 
3069     // Fix the scalar body counter (PHI node).
3070     // The old induction's phi node in the scalar body needs the truncated
3071     // value.
3072     for (BasicBlock *BB : LoopBypassBlocks)
3073       BCResumeVal->addIncoming(II.getStartValue(), BB);
3074     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3075   }
3076 
3077   // We need the OrigLoop (scalar loop part) latch terminator to help
3078   // produce correct debug info for the middle block BB instructions.
3079   // The legality check stage guarantees that the loop will have a single
3080   // latch.
3081   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3082          "Scalar loop latch terminator isn't a branch");
3083   BranchInst *ScalarLatchBr =
3084       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3085 
3086   // Add a check in the middle block to see if we have completed
3087   // all of the iterations in the first vector loop.
3088   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3089   // If tail is to be folded, we know we don't need to run the remainder.
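  // E.g., with N = 10 and VF * UF = 4, CountRoundDown is 8, so CmpN is false
  // and the scalar loop executes the remaining 2 iterations.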
3090   Value *CmpN = Builder.getTrue();
3091   if (!Cost->foldTailByMasking()) {
3092     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3093                            CountRoundDown, "cmp.n",
3094                            LoopMiddleBlock->getTerminator());
3095 
3096     // Here we use the same DebugLoc as the scalar loop latch branch instead
3097     // of the corresponding compare because they may have ended up with
3098     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3100     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3101   }
3102 
3103   BranchInst *BrInst =
3104       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3105   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3106   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3107 
3108   // Get ready to start creating new instructions into the vectorized body.
3109   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3110          "Inconsistent vector loop preheader");
3111   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3112 
3113   Optional<MDNode *> VectorizedLoopID =
3114       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3115                                       LLVMLoopVectorizeFollowupVectorized});
3116   if (VectorizedLoopID.hasValue()) {
3117     Lp->setLoopID(VectorizedLoopID.getValue());
3118 
3119     // Do not setAlreadyVectorized if loop attributes have been defined
3120     // explicitly.
3121     return LoopVectorPreHeader;
3122   }
3123 
3124   // Keep all loop hints from the original loop on the vector loop (we'll
3125   // replace the vectorizer-specific hints below).
3126   if (MDNode *LID = OrigLoop->getLoopID())
3127     Lp->setLoopID(LID);
3128 
3129   LoopVectorizeHints Hints(Lp, true, *ORE);
3130   Hints.setAlreadyVectorized();
3131 
3132 #ifdef EXPENSIVE_CHECKS
3133   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3134   LI->verify(*DT);
3135 #endif
3136 
3137   return LoopVectorPreHeader;
3138 }
3139 
3140 // Fix up external users of the induction variable. At this point, we are
3141 // in LCSSA form, with all external PHIs that use the IV having one input value,
3142 // coming from the remainder loop. We need those PHIs to also have a correct
3143 // value for the IV when arriving directly from the middle block.
3144 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3145                                        const InductionDescriptor &II,
3146                                        Value *CountRoundDown, Value *EndValue,
3147                                        BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the value that feeds into the phi from the
  // loop latch) and those that use the penultimate value (the PHI itself).
  // We allow both, but they, obviously, have different values.
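  // E.g., in shorthand LCSSA IR, 'exit: %a = phi [ %iv.next, %latch ]' needs
  // the last value, while 'exit: %b = phi [ %iv, %latch ]' needs the
  // penultimate one.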
3152 
3153   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3154 
3155   DenseMap<Value *, Value *> MissingVals;
3156 
3157   // An external user of the last iteration's value should see the value that
3158   // the remainder loop uses to initialize its own IV.
3159   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3160   for (User *U : PostInc->users()) {
3161     Instruction *UI = cast<Instruction>(U);
3162     if (!OrigLoop->contains(UI)) {
3163       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3164       MissingVals[UI] = EndValue;
3165     }
3166   }
3167 
  // An external user of the penultimate value needs to see EndValue - Step.
3169   // The simplest way to get this is to recompute it from the constituent SCEVs,
3170   // that is Start + (Step * (CRD - 1)).
3171   for (User *U : OrigPhi->users()) {
3172     auto *UI = cast<Instruction>(U);
3173     if (!OrigLoop->contains(UI)) {
3174       const DataLayout &DL =
3175           OrigLoop->getHeader()->getModule()->getDataLayout();
3176       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3177 
3178       IRBuilder<> B(MiddleBlock->getTerminator());
3179       Value *CountMinusOne = B.CreateSub(
3180           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3181       Value *CMO =
3182           !II.getStep()->getType()->isIntegerTy()
3183               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3184                              II.getStep()->getType())
3185               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3186       CMO->setName("cast.cmo");
3187       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3188       Escape->setName("ind.escape");
3189       MissingVals[UI] = Escape;
3190     }
3191   }
3192 
3193   for (auto &I : MissingVals) {
3194     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is, %IV2 = phi [...], [ %IV1, %latch ].
3197     // In this case, if IV1 has an external use, we need to avoid adding both
3198     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3199     // don't already have an incoming value for the middle block.
3200     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3201       PHI->addIncoming(I.second, MiddleBlock);
3202   }
3203 }
3204 
3205 namespace {
3206 
3207 struct CSEDenseMapInfo {
3208   static bool canHandle(const Instruction *I) {
3209     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3210            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3211   }
3212 
3213   static inline Instruction *getEmptyKey() {
3214     return DenseMapInfo<Instruction *>::getEmptyKey();
3215   }
3216 
3217   static inline Instruction *getTombstoneKey() {
3218     return DenseMapInfo<Instruction *>::getTombstoneKey();
3219   }
3220 
3221   static unsigned getHashValue(const Instruction *I) {
3222     assert(canHandle(I) && "Unknown instruction!");
3223     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3224                                                            I->value_op_end()));
3225   }
3226 
3227   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3228     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3229         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3230       return LHS == RHS;
3231     return LHS->isIdenticalTo(RHS);
3232   }
3233 };
3234 
3235 } // end anonymous namespace
3236 
/// Perform CSE of induction variable instructions.
3238 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3240   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3241   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3242     Instruction *In = &*I++;
3243 
3244     if (!CSEDenseMapInfo::canHandle(In))
3245       continue;
3246 
3247     // Check if we can replace this instruction with any of the
3248     // visited instructions.
3249     if (Instruction *V = CSEMap.lookup(In)) {
3250       In->replaceAllUsesWith(V);
3251       In->eraseFromParent();
3252       continue;
3253     }
3254 
3255     CSEMap[In] = In;
3256   }
3257 }
3258 
3259 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3260                                                        unsigned VF,
3261                                                        bool &NeedToScalarize) {
3262   Function *F = CI->getCalledFunction();
3263   Type *ScalarRetTy = CI->getType();
3264   SmallVector<Type *, 4> Tys, ScalarTys;
3265   for (auto &ArgOp : CI->arg_operands())
3266     ScalarTys.push_back(ArgOp->getType());
3267 
  // Estimate the cost of a scalarized vector call. The source operands are
  // assumed to be vectors, so we need to extract individual elements from
  // them, execute VF scalar calls, and then gather the results into the
  // vector return value.
3272   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3273   if (VF == 1)
3274     return ScalarCallCost;
3275 
3276   // Compute corresponding vector type for return value and arguments.
3277   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3278   for (Type *ScalarTy : ScalarTys)
3279     Tys.push_back(ToVectorTy(ScalarTy, VF));
3280 
3281   // Compute costs of unpacking argument values for the scalar calls and
3282   // packing the return values to a vector.
3283   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3284 
3285   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3286 
3287   // If we can't emit a vector call for this function, then the currently found
3288   // cost is the cost we need to return.
3289   NeedToScalarize = true;
3290   if (!TLI || CI->isNoBuiltin() || VFDatabase::getMappings(*CI).empty())
3291     return Cost;
3292 
3293   // If the corresponding vector cost is cheaper, return its cost.
3294   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3295   if (VectorCallCost < Cost) {
3296     NeedToScalarize = false;
3297     return VectorCallCost;
3298   }
3299   return Cost;
3300 }
3301 
3302 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3303                                                             unsigned VF) {
3304   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3305   assert(ID && "Expected intrinsic call!");
3306 
3307   FastMathFlags FMF;
3308   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3309     FMF = FPMO->getFastMathFlags();
3310 
3311   SmallVector<Value *, 4> Operands(CI->arg_operands());
3312   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3313 }
3314 
3315 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3316   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3317   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3318   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3319 }
3320 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3321   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3322   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3323   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3324 }
3325 
3326 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3327   // For every instruction `I` in MinBWs, truncate the operands, create a
3328   // truncated version of `I` and reextend its result. InstCombine runs
3329   // later and will remove any ext/trunc pairs.
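  // E.g., a <4 x i32> add whose result is known to need only 8 bits becomes
  // a <4 x i8> add on truncated operands, with the result zero-extended back
  // to <4 x i32>.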
3330   SmallPtrSet<Value *, 4> Erased;
3331   for (const auto &KV : Cost->getMinimalBitwidths()) {
3332     // If the value wasn't vectorized, we must maintain the original scalar
3333     // type. The absence of the value from VectorLoopValueMap indicates that it
3334     // wasn't vectorized.
3335     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3336       continue;
3337     for (unsigned Part = 0; Part < UF; ++Part) {
3338       Value *I = getOrCreateVectorValue(KV.first, Part);
3339       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3340           !isa<Instruction>(I))
3341         continue;
3342       Type *OriginalTy = I->getType();
3343       Type *ScalarTruncatedTy =
3344           IntegerType::get(OriginalTy->getContext(), KV.second);
3345       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3346                                           OriginalTy->getVectorNumElements());
3347       if (TruncatedTy == OriginalTy)
3348         continue;
3349 
3350       IRBuilder<> B(cast<Instruction>(I));
3351       auto ShrinkOperand = [&](Value *V) -> Value * {
3352         if (auto *ZI = dyn_cast<ZExtInst>(V))
3353           if (ZI->getSrcTy() == TruncatedTy)
3354             return ZI->getOperand(0);
3355         return B.CreateZExtOrTrunc(V, TruncatedTy);
3356       };
3357 
3358       // The actual instruction modification depends on the instruction type,
3359       // unfortunately.
3360       Value *NewI = nullptr;
3361       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3362         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3363                              ShrinkOperand(BO->getOperand(1)));
3364 
3365         // Any wrapping introduced by shrinking this operation shouldn't be
3366         // considered undefined behavior. So, we can't unconditionally copy
3367         // arithmetic wrapping flags to NewI.
3368         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3369       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3370         NewI =
3371             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3372                          ShrinkOperand(CI->getOperand(1)));
3373       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3374         NewI = B.CreateSelect(SI->getCondition(),
3375                               ShrinkOperand(SI->getTrueValue()),
3376                               ShrinkOperand(SI->getFalseValue()));
3377       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3378         switch (CI->getOpcode()) {
3379         default:
3380           llvm_unreachable("Unhandled cast!");
3381         case Instruction::Trunc:
3382           NewI = ShrinkOperand(CI->getOperand(0));
3383           break;
3384         case Instruction::SExt:
3385           NewI = B.CreateSExtOrTrunc(
3386               CI->getOperand(0),
3387               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3388           break;
3389         case Instruction::ZExt:
3390           NewI = B.CreateZExtOrTrunc(
3391               CI->getOperand(0),
3392               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3393           break;
3394         }
3395       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3396         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3397         auto *O0 = B.CreateZExtOrTrunc(
3398             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3399         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3400         auto *O1 = B.CreateZExtOrTrunc(
3401             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3402 
3403         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3404       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3405         // Don't do anything with the operands, just extend the result.
3406         continue;
3407       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3408         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3409         auto *O0 = B.CreateZExtOrTrunc(
3410             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3411         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3412         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3413       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3414         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3415         auto *O0 = B.CreateZExtOrTrunc(
3416             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3417         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3418       } else {
3419         // If we don't know what to do, be conservative and don't do anything.
3420         continue;
3421       }
3422 
3423       // Lastly, extend the result.
3424       NewI->takeName(cast<Instruction>(I));
3425       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3426       I->replaceAllUsesWith(Res);
3427       cast<Instruction>(I)->eraseFromParent();
3428       Erased.insert(I);
3429       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3430     }
3431   }
3432 
  // We'll have created a bunch of ZExts that are now unused. Clean them up.
3434   for (const auto &KV : Cost->getMinimalBitwidths()) {
3435     // If the value wasn't vectorized, we must maintain the original scalar
3436     // type. The absence of the value from VectorLoopValueMap indicates that it
3437     // wasn't vectorized.
3438     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3439       continue;
3440     for (unsigned Part = 0; Part < UF; ++Part) {
3441       Value *I = getOrCreateVectorValue(KV.first, Part);
3442       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3443       if (Inst && Inst->use_empty()) {
3444         Value *NewI = Inst->getOperand(0);
3445         Inst->eraseFromParent();
3446         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3447       }
3448     }
3449   }
3450 }
3451 
3452 void InnerLoopVectorizer::fixVectorizedLoop() {
3453   // Insert truncates and extends for any truncated instructions as hints to
3454   // InstCombine.
3455   if (VF > 1)
3456     truncateToMinimalBitwidths();
3457 
3458   // Fix widened non-induction PHIs by setting up the PHI operands.
3459   if (OrigPHIsToFix.size()) {
3460     assert(EnableVPlanNativePath &&
3461            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3462     fixNonInductionPHIs();
3463   }
3464 
3465   // At this point every instruction in the original loop is widened to a
3466   // vector form. Now we need to fix the recurrences in the loop. These PHI
3467   // nodes are currently empty because we did not want to introduce cycles.
3468   // This is the second stage of vectorizing recurrences.
3469   fixCrossIterationPHIs();
3470 
3471   // Forget the original basic block.
3472   PSE.getSE()->forgetLoop(OrigLoop);
3473 
3474   // Fix-up external users of the induction variables.
3475   for (auto &Entry : *Legal->getInductionVars())
3476     fixupIVUsers(Entry.first, Entry.second,
3477                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3478                  IVEndValues[Entry.first], LoopMiddleBlock);
3479 
3480   fixLCSSAPHIs();
3481   for (Instruction *PI : PredicatedInstructions)
3482     sinkScalarOperands(&*PI);
3483 
3484   // Remove redundant induction instructions.
3485   cse(LoopVectorBody);
3486 }
3487 
3488 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3489   // In order to support recurrences we need to be able to vectorize Phi nodes.
3490   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3491   // stage #2: We now need to fix the recurrences by adding incoming edges to
3492   // the currently empty PHI nodes. At this point every instruction in the
3493   // original loop is widened to a vector form so we can use them to construct
3494   // the incoming edges.
3495   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3496     // Handle first-order recurrences and reductions that need to be fixed.
3497     if (Legal->isFirstOrderRecurrence(&Phi))
3498       fixFirstOrderRecurrence(&Phi);
3499     else if (Legal->isReductionVariable(&Phi))
3500       fixReduction(&Phi);
3501   }
3502 }
3503 
3504 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3505   // This is the second phase of vectorizing first-order recurrences. An
3506   // overview of the transformation is described below. Suppose we have the
3507   // following loop.
3508   //
3509   //   for (int i = 0; i < n; ++i)
3510   //     b[i] = a[i] - a[i - 1];
3511   //
3512   // There is a first-order recurrence on "a". For this loop, the shorthand
3513   // scalar IR looks like:
3514   //
3515   //   scalar.ph:
3516   //     s_init = a[-1]
3517   //     br scalar.body
3518   //
3519   //   scalar.body:
3520   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3521   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3522   //     s2 = a[i]
3523   //     b[i] = s2 - s1
3524   //     br cond, scalar.body, ...
3525   //
  // In this example, s1 is a recurrence because its value depends on the
3527   // previous iteration. In the first phase of vectorization, we created a
3528   // temporary value for s1. We now complete the vectorization and produce the
3529   // shorthand vector IR shown below (for VF = 4, UF = 1).
3530   //
3531   //   vector.ph:
3532   //     v_init = vector(..., ..., ..., a[-1])
3533   //     br vector.body
3534   //
3535   //   vector.body
3536   //     i = phi [0, vector.ph], [i+4, vector.body]
3537   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3538   //     v2 = a[i, i+1, i+2, i+3];
3539   //     v3 = vector(v1(3), v2(0, 1, 2))
3540   //     b[i, i+1, i+2, i+3] = v2 - v3
3541   //     br cond, vector.body, middle.block
3542   //
3543   //   middle.block:
3544   //     x = v2(3)
3545   //     br scalar.ph
3546   //
3547   //   scalar.ph:
3548   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3549   //     br scalar.body
3550   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3553 
3554   // Get the original loop preheader and single loop latch.
3555   auto *Preheader = OrigLoop->getLoopPreheader();
3556   auto *Latch = OrigLoop->getLoopLatch();
3557 
3558   // Get the initial and previous values of the scalar recurrence.
3559   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3560   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3561 
3562   // Create a vector from the initial value.
3563   auto *VectorInit = ScalarInit;
3564   if (VF > 1) {
3565     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3566     VectorInit = Builder.CreateInsertElement(
3567         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3568         Builder.getInt32(VF - 1), "vector.recur.init");
3569   }
3570 
3571   // We constructed a temporary phi node in the first phase of vectorization.
3572   // This phi node will eventually be deleted.
3573   Builder.SetInsertPoint(
3574       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3575 
3576   // Create a phi node for the new recurrence. The current value will either be
3577   // the initial value inserted into a vector or loop-varying vector value.
3578   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3579   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3580 
3581   // Get the vectorized previous value of the last part UF - 1. It appears last
3582   // among all unrolled iterations, due to the order of their construction.
3583   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3584 
3585   // Find and set the insertion point after the previous value if it is an
3586   // instruction.
3587   BasicBlock::iterator InsertPt;
3588   // Note that the previous value may have been constant-folded so it is not
3589   // guaranteed to be an instruction in the vector loop.
3590   // FIXME: Loop invariant values do not form recurrences. We should deal with
3591   //        them earlier.
3592   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3593     InsertPt = LoopVectorBody->getFirstInsertionPt();
3594   else {
3595     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3596     if (isa<PHINode>(PreviousLastPart))
3597       // If the previous value is a phi node, we should insert after all the phi
3598       // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different from
      // LoopVectorBody, in case we predicate the loop.
3601       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3602     else
3603       InsertPt = ++PreviousInst->getIterator();
3604   }
3605   Builder.SetInsertPoint(&*InsertPt);
3606 
3607   // We will construct a vector for the recurrence by combining the values for
3608   // the current and previous iterations. This is the required shuffle mask.
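  // E.g., for VF = 4 the mask is <3, 4, 5, 6>, selecting the last element of
  // the first input vector and the first three elements of the second.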
3609   SmallVector<Constant *, 8> ShuffleMask(VF);
3610   ShuffleMask[0] = Builder.getInt32(VF - 1);
3611   for (unsigned I = 1; I < VF; ++I)
3612     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3613 
3614   // The vector from which to take the initial value for the current iteration
3615   // (actual or unrolled). Initially, this is the vector phi node.
3616   Value *Incoming = VecPhi;
3617 
3618   // Shuffle the current and previous vector and update the vector parts.
3619   for (unsigned Part = 0; Part < UF; ++Part) {
3620     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3621     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3622     auto *Shuffle =
3623         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3624                                              ConstantVector::get(ShuffleMask))
3625                : Incoming;
3626     PhiPart->replaceAllUsesWith(Shuffle);
3627     cast<Instruction>(PhiPart)->eraseFromParent();
3628     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3629     Incoming = PreviousPart;
3630   }
3631 
3632   // Fix the latch value of the new recurrence in the vector loop.
3633   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3634 
3635   // Extract the last vector element in the middle block. This will be the
3636   // initial value for the recurrence when jumping to the scalar loop.
3637   auto *ExtractForScalar = Incoming;
3638   if (VF > 1) {
3639     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3640     ExtractForScalar = Builder.CreateExtractElement(
3641         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3642   }
3643   // Extract the second-to-last element in the middle block if the
3644   // Phi is used outside the loop. We need to extract the phi itself
3645   // and not the last element (the phi update in the current iteration). This
3646   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3647   // when the scalar loop is not run at all.
3648   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3649   if (VF > 1)
3650     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3651         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3652   // When the loop is unrolled without vectorizing, initialize
3653   // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3654   // `Incoming`. This is analogous to the vectorized case above: extracting the
3655   // second-to-last element when VF > 1.
3656   else if (UF > 1)
3657     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3658 
3659   // Fix the initial value of the original recurrence in the scalar loop.
3660   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3661   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3662   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3663     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3664     Start->addIncoming(Incoming, BB);
3665   }
3666 
3667   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3668   Phi->setName("scalar.recur");
3669 
3670   // Finally, fix users of the recurrence outside the loop. The users will need
3671   // either the last value of the scalar recurrence or the last value of the
3672   // vector recurrence we extracted in the middle block. Since the loop is in
3673   // LCSSA form, we just need to find all the phi nodes for the original scalar
3674   // recurrence in the exit block, and then add an edge for the middle block.
3675   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3676     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3677       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3678     }
3679   }
3680 }
3681 
3682 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3683   Constant *Zero = Builder.getInt32(0);
3684 
3685   // Get its reduction variable descriptor.
3686   assert(Legal->isReductionVariable(Phi) &&
3687          "Unable to find the reduction variable");
3688   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3689 
3690   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3691   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3692   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3693   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3694     RdxDesc.getMinMaxRecurrenceKind();
3695   setDebugLocFromInst(Builder, ReductionStartValue);
3696 
3697   // We need to generate a reduction vector from the incoming scalar.
3698   // To do so, we need to generate the 'identity' vector and override
3699   // one of the elements with the incoming scalar reduction. We need
3700   // to do it in the vector-loop preheader.
3701   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3702 
3703   // This is the vector-clone of the value that leaves the loop.
3704   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3705 
3706   // Find the reduction identity value: zero for addition, or, and xor; one
3707   // for multiplication; -1 for and.
3708   Value *Identity;
3709   Value *VectorStart;
3710   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3711       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3712     // MinMax reductions have the start value as their identity.
3713     if (VF == 1) {
3714       VectorStart = Identity = ReductionStartValue;
3715     } else {
3716       VectorStart = Identity =
3717         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3718     }
3719   } else {
3720     // Handle other reduction kinds:
3721     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3722         RK, VecTy->getScalarType());
3723     if (VF == 1) {
3724       Identity = Iden;
3725       // This vector is the Identity vector where the first element is the
3726       // incoming scalar reduction.
3727       VectorStart = ReductionStartValue;
3728     } else {
3729       Identity = ConstantVector::getSplat(VF, Iden);
3730 
3731       // This vector is the Identity vector where the first element is the
3732       // incoming scalar reduction.
3733       VectorStart =
3734         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3735     }
3736   }
3737 
3738   // Wrap flags are in general invalid after vectorization, clear them.
3739   clearReductionWrapFlags(RdxDesc);
3740 
3741   // Fix the vector-loop phi.
3742 
3743   // Reductions do not have to start at zero. They can start with
3744   // any loop invariant values.
3745   BasicBlock *Latch = OrigLoop->getLoopLatch();
3746   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3747 
3748   for (unsigned Part = 0; Part < UF; ++Part) {
3749     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3750     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3751     // Make sure to add the reduction start value only to the
3752     // first unroll part.
3753     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3754     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3755     cast<PHINode>(VecRdxPhi)
3756       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3757   }
3758 
3759   // Before each round, move the insertion point right between
3760   // the PHIs and the values we are going to write.
3761   // This allows us to write both PHINodes and the extractelement
3762   // instructions.
3763   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3764 
3765   setDebugLocFromInst(Builder, LoopExitInst);
3766 
3767   // If the tail is folded by masking, the vector value that leaves the loop
3768   // should be a Select choosing between the vectorized LoopExitInst and the
3769   // vectorized Phi, rather than the LoopExitInst itself.
3770   if (Cost->foldTailByMasking()) {
3771     for (unsigned Part = 0; Part < UF; ++Part) {
3772       Value *VecLoopExitInst =
3773           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3774       Value *Sel = nullptr;
3775       for (User *U : VecLoopExitInst->users()) {
3776         if (isa<SelectInst>(U)) {
3777           assert(!Sel && "Reduction exit feeding two selects");
3778           Sel = U;
3779         } else
3780           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3781       }
3782       assert(Sel && "Reduction exit feeds no select");
3783       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3784     }
3785   }
3786 
3787   // If the vector reduction can be performed in a smaller type, we truncate
3788   // then extend the loop exit value to enable InstCombine to evaluate the
3789   // entire expression in the smaller type.
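  // For example, if an i32 reduction has been narrowed to an i8 recurrence
  // type, each part is truncated to <VF x i8> and immediately sign- or
  // zero-extended back to <VF x i32>, and users of the original value are
  // rewritten to use the extended value.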
3790   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3791     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3792     Builder.SetInsertPoint(
3793         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3794     VectorParts RdxParts(UF);
3795     for (unsigned Part = 0; Part < UF; ++Part) {
3796       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3797       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3798       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3799                                         : Builder.CreateZExt(Trunc, VecTy);
3800       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3801            UI != RdxParts[Part]->user_end();)
3802         if (*UI != Trunc) {
3803           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3804           RdxParts[Part] = Extnd;
3805         } else {
3806           ++UI;
3807         }
3808     }
3809     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3810     for (unsigned Part = 0; Part < UF; ++Part) {
3811       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3812       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3813     }
3814   }
3815 
3816   // Reduce all of the unrolled parts into a single vector.
3817   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3818   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3819 
3820   // The middle block terminator has already been assigned a DebugLoc here (the
3821   // OrigLoop's single latch terminator). We want the whole middle block to
3822   // appear to execute on this line because: (a) it is all compiler generated,
3823   // (b) these instructions are always executed after evaluating the latch
3824   // conditional branch, and (c) other passes may add new predecessors which
3825   // terminate on this line. This is the easiest way to ensure we don't
3826   // accidentally cause an extra step back into the loop while debugging.
3827   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
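  // For example, with UF = 2 and an integer add reduction, this emits a single
  // "bin.rdx" add of the two parts; larger unroll factors extend the chain by
  // one operation per additional part.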
3828   for (unsigned Part = 1; Part < UF; ++Part) {
3829     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3830     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3831       // Floating point operations had to be 'fast' to enable the reduction.
3832       ReducedPartRdx = addFastMathFlag(
3833           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3834                               ReducedPartRdx, "bin.rdx"),
3835           RdxDesc.getFastMathFlags());
3836     else
3837       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3838                                       RdxPart);
3839   }
3840 
3841   if (VF > 1) {
3842     bool NoNaN = Legal->hasFunNoNaNAttr();
3843     ReducedPartRdx =
3844         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3845     // If the reduction can be performed in a smaller type, we need to extend
3846     // the reduction to the wider type before we branch to the original loop.
3847     if (Phi->getType() != RdxDesc.getRecurrenceType())
3848       ReducedPartRdx =
3849         RdxDesc.isSigned()
3850         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3851         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3852   }
3853 
3854   // Create a phi node that merges control-flow from the backedge-taken check
3855   // block and the middle block.
3856   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3857                                         LoopScalarPreHeader->getTerminator());
3858   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3859     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3860   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3861 
3862   // Now, we need to fix the users of the reduction variable
3863   // inside and outside of the scalar remainder loop.
3864   // We know that the loop is in LCSSA form. We need to update the
3865   // PHI nodes in the exit blocks.
3866   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3867     // All PHINodes need to have a single entry edge, or two if
3868     // we already fixed them.
3869     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3870 
3871     // We found a reduction value exit-PHI. Update it with the
3872     // incoming bypass edge.
3873     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3874       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3875   } // end of the LCSSA phi scan.
3876 
3877   // Fix the scalar loop reduction variable with the incoming reduction sum
3878   // from the vector body and from the backedge value.
3879   int IncomingEdgeBlockIdx =
3880     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3881   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3882   // Pick the other block.
3883   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3884   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3885   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3886 }
3887 
3888 void InnerLoopVectorizer::clearReductionWrapFlags(
3889     RecurrenceDescriptor &RdxDesc) {
3890   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3891   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3892       RK != RecurrenceDescriptor::RK_IntegerMult)
3893     return;
3894 
3895   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3896   assert(LoopExitInstr && "null loop exit instruction");
3897   SmallVector<Instruction *, 8> Worklist;
3898   SmallPtrSet<Instruction *, 8> Visited;
3899   Worklist.push_back(LoopExitInstr);
3900   Visited.insert(LoopExitInstr);
3901 
3902   while (!Worklist.empty()) {
3903     Instruction *Cur = Worklist.pop_back_val();
3904     if (isa<OverflowingBinaryOperator>(Cur))
3905       for (unsigned Part = 0; Part < UF; ++Part) {
3906         Value *V = getOrCreateVectorValue(Cur, Part);
3907         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3908       }
3909 
3910     for (User *U : Cur->users()) {
3911       Instruction *UI = cast<Instruction>(U);
3912       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3913           Visited.insert(UI).second)
3914         Worklist.push_back(UI);
3915     }
3916   }
3917 }
3918 
3919 void InnerLoopVectorizer::fixLCSSAPHIs() {
3920   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3921     if (LCSSAPhi.getNumIncomingValues() == 1) {
3922       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3923       // Non-instruction incoming values will have only one value.
3924       unsigned LastLane = 0;
3925       if (isa<Instruction>(IncomingValue))
3926           LastLane = Cost->isUniformAfterVectorization(
3927                          cast<Instruction>(IncomingValue), VF)
3928                          ? 0
3929                          : VF - 1;
3930       // Can be a loop invariant incoming value or the last scalar value to be
3931       // extracted from the vectorized loop.
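      // For example, for a non-uniform instruction with VF = 4 and UF = 2,
      // this requests lane 3 of part 1, i.e. the very last scalar value.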
3932       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3933       Value *lastIncomingValue =
3934           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3935       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3936     }
3937   }
3938 }
3939 
3940 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3941   // The basic block and loop containing the predicated instruction.
3942   auto *PredBB = PredInst->getParent();
3943   auto *VectorLoop = LI->getLoopFor(PredBB);
3944 
3945   // Initialize a worklist with the operands of the predicated instruction.
3946   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3947 
3948   // Holds instructions that we need to analyze again. An instruction may be
3949   // reanalyzed if we don't yet know if we can sink it or not.
3950   SmallVector<Instruction *, 8> InstsToReanalyze;
3951 
3952   // Returns true if a given use occurs in the predicated block. Phi nodes use
3953   // their operands in their corresponding predecessor blocks.
3954   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3955     auto *I = cast<Instruction>(U.getUser());
3956     BasicBlock *BB = I->getParent();
3957     if (auto *Phi = dyn_cast<PHINode>(I))
3958       BB = Phi->getIncomingBlock(
3959           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3960     return BB == PredBB;
3961   };
3962 
3963   // Iteratively sink the scalarized operands of the predicated instruction
3964   // into the block we created for it. When an instruction is sunk, its
3965   // operands are then added to the worklist. The algorithm ends after one pass
3966   // through the worklist doesn't sink a single instruction.
3967   bool Changed;
3968   do {
3969     // Add the instructions that need to be reanalyzed to the worklist, and
3970     // reset the changed indicator.
3971     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3972     InstsToReanalyze.clear();
3973     Changed = false;
3974 
3975     while (!Worklist.empty()) {
3976       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3977 
3978       // We can't sink an instruction if it is a phi node, is already in the
3979       // predicated block, is not in the loop, or may have side effects.
3980       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3981           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3982         continue;
3983 
3984       // It's legal to sink the instruction if all its uses occur in the
3985       // predicated block. Otherwise, there's nothing to do yet, and we may
3986       // need to reanalyze the instruction.
3987       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3988         InstsToReanalyze.push_back(I);
3989         continue;
3990       }
3991 
3992       // Move the instruction to the beginning of the predicated block, and add
3993       // its operands to the worklist.
3994       I->moveBefore(&*PredBB->getFirstInsertionPt());
3995       Worklist.insert(I->op_begin(), I->op_end());
3996 
3997       // The sinking may have enabled other instructions to be sunk, so we will
3998       // need to iterate.
3999       Changed = true;
4000     }
4001   } while (Changed);
4002 }
4003 
4004 void InnerLoopVectorizer::fixNonInductionPHIs() {
4005   for (PHINode *OrigPhi : OrigPHIsToFix) {
4006     PHINode *NewPhi =
4007         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4008     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4009 
4010     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4011         predecessors(OrigPhi->getParent()));
4012     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4013         predecessors(NewPhi->getParent()));
4014     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4015            "Scalar and Vector BB should have the same number of predecessors");
4016 
4017     // The insertion point in Builder may be invalidated by the time we get
4018     // here. Force the Builder insertion point to something valid so that we do
4019     // not run into issues during insertion point restore in
4020     // getOrCreateVectorValue calls below.
4021     Builder.SetInsertPoint(NewPhi);
4022 
4023     // The predecessor order is preserved and we can rely on mapping between
4024     // scalar and vector block predecessors.
4025     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4026       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4027 
4028       // When looking up the new scalar/vector values to fix up, use incoming
4029       // values from original phi.
4030       Value *ScIncV =
4031           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4032 
4033       // Scalar incoming value may need a broadcast
4034       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4035       NewPhi->addIncoming(NewIncV, NewPredBB);
4036     }
4037   }
4038 }
4039 
4040 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4041                                    unsigned VF, bool IsPtrLoopInvariant,
4042                                    SmallBitVector &IsIndexLoopInvariant) {
4043   // Construct a vector GEP by widening the operands of the scalar GEP as
4044   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4045   // results in a vector of pointers when at least one operand of the GEP
4046   // is vector-typed. Thus, to keep the representation compact, we only use
4047   // vector-typed operands for loop-varying values.
4048 
4049   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4050     // If we are vectorizing, but the GEP has only loop-invariant operands,
4051     // the GEP we build (by only using vector-typed operands for
4052     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4053     // produce a vector of pointers, we need to either arbitrarily pick an
4054     // operand to broadcast, or broadcast a clone of the original GEP.
4055     // Here, we broadcast a clone of the original.
4056     //
4057     // TODO: If at some point we decide to scalarize instructions having
4058     //       loop-invariant operands, this special case will no longer be
4059     //       required. We would add the scalarization decision to
4060     //       collectLoopScalars() and teach getVectorValue() to broadcast
4061     //       the lane-zero scalar value.
4062     auto *Clone = Builder.Insert(GEP->clone());
4063     for (unsigned Part = 0; Part < UF; ++Part) {
4064       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4065       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4066       addMetadata(EntryPart, GEP);
4067     }
4068   } else {
4069     // If the GEP has at least one loop-varying operand, we are sure to
4070     // produce a vector of pointers. But if we are only unrolling, we want
4071     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4072     // produce with the code below will be scalar (if VF == 1) or vector
4073     // (otherwise). Note that for the unroll-only case, we still maintain
4074     // values in the vector value map (VectorLoopValueMap), as we do for other
4075     // instructions.
4076     for (unsigned Part = 0; Part < UF; ++Part) {
4077       // The pointer operand of the new GEP. If it's loop-invariant, we
4078       // won't broadcast it.
4079       auto *Ptr = IsPtrLoopInvariant
4080                       ? GEP->getPointerOperand()
4081                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4082 
4083       // Collect all the indices for the new GEP. If any index is
4084       // loop-invariant, we won't broadcast it.
4085       SmallVector<Value *, 4> Indices;
4086       for (auto Index : enumerate(GEP->indices())) {
4087         Value *User = Index.value().get();
4088         if (IsIndexLoopInvariant[Index.index()])
4089           Indices.push_back(User);
4090         else
4091           Indices.push_back(getOrCreateVectorValue(User, Part));
4092       }
4093 
4094       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4095       // but it should be a vector otherwise.
4096       auto *NewGEP =
4097           GEP->isInBounds()
4098               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4099                                           Indices)
4100               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4101       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4102              "NewGEP is not a pointer vector");
4103       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4104       addMetadata(NewGEP, GEP);
4105     }
4106   }
4107 }
4108 
4109 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4110                                               unsigned VF) {
4111   PHINode *P = cast<PHINode>(PN);
4112   if (EnableVPlanNativePath) {
4113     // Currently we enter here in the VPlan-native path for non-induction
4114     // PHIs where all control flow is uniform. We simply widen these PHIs.
4115     // Create a vector phi with no operands - the vector phi operands will be
4116     // set at the end of vector code generation.
4117     Type *VecTy =
4118         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4119     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4120     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4121     OrigPHIsToFix.push_back(P);
4122 
4123     return;
4124   }
4125 
4126   assert(PN->getParent() == OrigLoop->getHeader() &&
4127          "Non-header phis should have been handled elsewhere");
4128 
4129   // In order to support recurrences we need to be able to vectorize Phi nodes.
4130   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4131   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4132   // this value when we vectorize all of the instructions that use the PHI.
4133   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4134     for (unsigned Part = 0; Part < UF; ++Part) {
4135       // This is phase one of vectorizing PHIs.
4136       Type *VecTy =
4137           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4138       Value *EntryPart = PHINode::Create(
4139           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4140       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4141     }
4142     return;
4143   }
4144 
4145   setDebugLocFromInst(Builder, P);
4146 
4147   // This PHINode must be an induction variable.
4148   // Make sure that we know about it.
4149   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4150 
4151   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4152   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4153 
4154   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4155   // which can be found from the original scalar operations.
4156   switch (II.getKind()) {
4157   case InductionDescriptor::IK_NoInduction:
4158     llvm_unreachable("Unknown induction");
4159   case InductionDescriptor::IK_IntInduction:
4160   case InductionDescriptor::IK_FpInduction:
4161     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4162   case InductionDescriptor::IK_PtrInduction: {
4163     // Handle the pointer induction variable case.
4164     assert(P->getType()->isPointerTy() && "Unexpected type.");
4165     // This is the normalized GEP that starts counting at zero.
4166     Value *PtrInd = Induction;
4167     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4168     // Determine the number of scalars we need to generate for each unroll
4169     // iteration. If the instruction is uniform, we only need to generate the
4170     // first lane. Otherwise, we generate all VF values.
4171     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4172     // These are the scalar results. Notice that we don't generate vector GEPs
4173     // because scalar GEPs result in better code.
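    // For example, with VF = 4, UF = 2 and a non-uniform pointer, part 0
    // produces scalar GEPs for offsets 0..3 and part 1 for offsets 4..7,
    // relative to the normalized induction.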
4174     for (unsigned Part = 0; Part < UF; ++Part) {
4175       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4176         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4177         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4178         Value *SclrGep =
4179             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4180         SclrGep->setName("next.gep");
4181         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4182       }
4183     }
4184     return;
4185   }
4186   }
4187 }
4188 
4189 /// A helper function for checking whether an integer division-related
4190 /// instruction may divide by zero (in which case it must be predicated if
4191 /// executed conditionally in the scalar code).
4192 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4193 /// Non-zero divisors that are not compile-time constants will not be
4194 /// converted into multiplication, so we will still end up scalarizing
4195 /// the division, but can do so w/o predication.
4196 static bool mayDivideByZero(Instruction &I) {
4197   assert((I.getOpcode() == Instruction::UDiv ||
4198           I.getOpcode() == Instruction::SDiv ||
4199           I.getOpcode() == Instruction::URem ||
4200           I.getOpcode() == Instruction::SRem) &&
4201          "Unexpected instruction");
4202   Value *Divisor = I.getOperand(1);
4203   auto *CInt = dyn_cast<ConstantInt>(Divisor);
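  // For example, a division by the constant 4 needs no predication, whereas a
  // division by a loop-varying or otherwise unknown value (or by the constant
  // 0) does.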
4204   return !CInt || CInt->isZero();
4205 }
4206 
4207 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4208   switch (I.getOpcode()) {
4209   case Instruction::Br:
4210   case Instruction::PHI:
4211   case Instruction::GetElementPtr:
4212     llvm_unreachable("This instruction is handled by a different recipe.");
4213   case Instruction::UDiv:
4214   case Instruction::SDiv:
4215   case Instruction::SRem:
4216   case Instruction::URem:
4217   case Instruction::Add:
4218   case Instruction::FAdd:
4219   case Instruction::Sub:
4220   case Instruction::FSub:
4221   case Instruction::FNeg:
4222   case Instruction::Mul:
4223   case Instruction::FMul:
4224   case Instruction::FDiv:
4225   case Instruction::FRem:
4226   case Instruction::Shl:
4227   case Instruction::LShr:
4228   case Instruction::AShr:
4229   case Instruction::And:
4230   case Instruction::Or:
4231   case Instruction::Xor: {
4232     // Just widen unops and binops.
4233     setDebugLocFromInst(Builder, &I);
4234 
4235     for (unsigned Part = 0; Part < UF; ++Part) {
4236       SmallVector<Value *, 2> Ops;
4237       for (Value *Op : I.operands())
4238         Ops.push_back(getOrCreateVectorValue(Op, Part));
4239 
4240       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4241 
4242       if (auto *VecOp = dyn_cast<Instruction>(V))
4243         VecOp->copyIRFlags(&I);
4244 
4245       // Use this vector value for all users of the original instruction.
4246       VectorLoopValueMap.setVectorValue(&I, Part, V);
4247       addMetadata(V, &I);
4248     }
4249 
4250     break;
4251   }
4252   case Instruction::Select: {
4253     // Widen selects.
4254     // If the selector is loop invariant we can create a select
4255     // instruction with a scalar condition. Otherwise, use vector-select.
4256     auto *SE = PSE.getSE();
4257     bool InvariantCond =
4258         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4259     setDebugLocFromInst(Builder, &I);
4260 
4261     // The condition can be loop invariant but still defined inside the
4262     // loop. This means that we can't just use the original 'cond' value.
4263     // We have to take the 'vectorized' value and pick the first lane.
4264     // InstCombine will make this a no-op.
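    // For example, a compare of loop-invariant operands that is nevertheless
    // emitted inside the loop body is handled by using lane {0, 0} of its
    // vectorized form as the scalar condition for every unroll part.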
4265 
4266     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4267 
4268     for (unsigned Part = 0; Part < UF; ++Part) {
4269       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4270       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4271       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4272       Value *Sel =
4273           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4274       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4275       addMetadata(Sel, &I);
4276     }
4277 
4278     break;
4279   }
4280 
4281   case Instruction::ICmp:
4282   case Instruction::FCmp: {
4283     // Widen compares. Generate vector compares.
4284     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4285     auto *Cmp = cast<CmpInst>(&I);
4286     setDebugLocFromInst(Builder, Cmp);
4287     for (unsigned Part = 0; Part < UF; ++Part) {
4288       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4289       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4290       Value *C = nullptr;
4291       if (FCmp) {
4292         // Propagate fast math flags.
4293         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4294         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4295         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4296       } else {
4297         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4298       }
4299       VectorLoopValueMap.setVectorValue(&I, Part, C);
4300       addMetadata(C, &I);
4301     }
4302 
4303     break;
4304   }
4305 
4306   case Instruction::ZExt:
4307   case Instruction::SExt:
4308   case Instruction::FPToUI:
4309   case Instruction::FPToSI:
4310   case Instruction::FPExt:
4311   case Instruction::PtrToInt:
4312   case Instruction::IntToPtr:
4313   case Instruction::SIToFP:
4314   case Instruction::UIToFP:
4315   case Instruction::Trunc:
4316   case Instruction::FPTrunc:
4317   case Instruction::BitCast: {
4318     auto *CI = cast<CastInst>(&I);
4319     setDebugLocFromInst(Builder, CI);
4320 
4321     /// Vectorize casts.
4322     Type *DestTy =
4323         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4324 
4325     for (unsigned Part = 0; Part < UF; ++Part) {
4326       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4327       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4328       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4329       addMetadata(Cast, &I);
4330     }
4331     break;
4332   }
4333 
4334   case Instruction::Call: {
4335     // Ignore dbg intrinsics.
4336     if (isa<DbgInfoIntrinsic>(I))
4337       break;
4338     setDebugLocFromInst(Builder, &I);
4339 
4340     Module *M = I.getParent()->getParent()->getParent();
4341     auto *CI = cast<CallInst>(&I);
4342 
4343     SmallVector<Type *, 4> Tys;
4344     for (Value *ArgOperand : CI->arg_operands())
4345       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4346 
4347     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4348 
4349     // The flag shows whether we use an intrinsic or a regular call for the
4350     // vectorized version of the instruction, i.e., whether it is beneficial
4351     // to perform the intrinsic call rather than the library call.
4352     bool NeedToScalarize;
4353     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4354     bool UseVectorIntrinsic =
4355         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4356     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4357            "Instruction should be scalarized elsewhere.");
4358 
4359     for (unsigned Part = 0; Part < UF; ++Part) {
4360       SmallVector<Value *, 4> Args;
4361       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4362         Value *Arg = CI->getArgOperand(i);
4363         // Some intrinsics have a scalar argument - don't replace it with a
4364         // vector.
4365         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4366           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4367         Args.push_back(Arg);
4368       }
4369 
4370       Function *VectorF;
4371       if (UseVectorIntrinsic) {
4372         // Use vector version of the intrinsic.
4373         Type *TysForDecl[] = {CI->getType()};
4374         if (VF > 1)
4375           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4376         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4377       } else {
4378         // Use vector version of the function call.
4379         const VFShape Shape =
4380             VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4381 #ifndef NDEBUG
4382         const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
4383         assert(std::find_if(Infos.begin(), Infos.end(),
4384                             [&Shape](const VFInfo &Info) {
4385                               return Info.Shape == Shape;
4386                             }) != Infos.end() &&
4387                "Vector function shape is missing from the database.");
4388 #endif
4389         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4390       }
4391       assert(VectorF && "Can't create vector function.");
4392 
4393       SmallVector<OperandBundleDef, 1> OpBundles;
4394       CI->getOperandBundlesAsDefs(OpBundles);
4395       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4396 
4397       if (isa<FPMathOperator>(V))
4398         V->copyFastMathFlags(CI);
4399 
4400       VectorLoopValueMap.setVectorValue(&I, Part, V);
4401       addMetadata(V, &I);
4402     }
4403 
4404     break;
4405   }
4406 
4407   default:
4408     // This instruction is not vectorized by simple widening.
4409     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4410     llvm_unreachable("Unhandled instruction!");
4411   } // end of switch.
4412 }
4413 
4414 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4415   // We should not collect Scalars more than once per VF. Right now, this
4416   // function is called from collectUniformsAndScalars(), which already does
4417   // this check. Collecting Scalars for VF=1 does not make any sense.
4418   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4419          "This function should not be visited twice for the same VF");
4420 
4421   SmallSetVector<Instruction *, 8> Worklist;
4422 
4423   // These sets are used to seed the analysis with pointers used by memory
4424   // accesses that will remain scalar.
4425   SmallSetVector<Instruction *, 8> ScalarPtrs;
4426   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4427 
4428   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4429   // The pointer operands of loads and stores will be scalar as long as the
4430   // memory access is not a gather or scatter operation. The value operand of a
4431   // store will remain scalar if the store is scalarized.
4432   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4433     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4434     assert(WideningDecision != CM_Unknown &&
4435            "Widening decision should be ready at this moment");
4436     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4437       if (Ptr == Store->getValueOperand())
4438         return WideningDecision == CM_Scalarize;
4439     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4440            "Ptr is neither a value or pointer operand");
4441     return WideningDecision != CM_GatherScatter;
4442   };
4443 
4444   // A helper that returns true if the given value is a bitcast or
4445   // getelementptr instruction contained in the loop.
4446   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4447     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4448             isa<GetElementPtrInst>(V)) &&
4449            !TheLoop->isLoopInvariant(V);
4450   };
4451 
4452   // A helper that evaluates a memory access's use of a pointer. If the use
4453   // will be a scalar use, and the pointer is only used by memory accesses, we
4454   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4455   // PossibleNonScalarPtrs.
4456   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4457     // We only care about bitcast and getelementptr instructions contained in
4458     // the loop.
4459     if (!isLoopVaryingBitCastOrGEP(Ptr))
4460       return;
4461 
4462     // If the pointer has already been identified as scalar (e.g., if it was
4463     // also identified as uniform), there's nothing to do.
4464     auto *I = cast<Instruction>(Ptr);
4465     if (Worklist.count(I))
4466       return;
4467 
4468     // If the use of the pointer will be a scalar use, and all users of the
4469     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4470     // place the pointer in PossibleNonScalarPtrs.
4471     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4472           return isa<LoadInst>(U) || isa<StoreInst>(U);
4473         }))
4474       ScalarPtrs.insert(I);
4475     else
4476       PossibleNonScalarPtrs.insert(I);
4477   };
4478 
4479   // We seed the scalars analysis with three classes of instructions: (1)
4480   // instructions marked uniform-after-vectorization, (2) bitcast and
4481   // getelementptr instructions used by memory accesses requiring a scalar use,
4482   // and (3) pointer induction variables and their update instructions (we
4483   // currently only scalarize these).
4484   //
4485   // (1) Add to the worklist all instructions that have been identified as
4486   // uniform-after-vectorization.
4487   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4488 
4489   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4490   // memory accesses requiring a scalar use. The pointer operands of loads and
4491   // stores will be scalar as long as the memory access is not a gather or
4492   // scatter operation. The value operand of a store will remain scalar if the
4493   // store is scalarized.
4494   for (auto *BB : TheLoop->blocks())
4495     for (auto &I : *BB) {
4496       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4497         evaluatePtrUse(Load, Load->getPointerOperand());
4498       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4499         evaluatePtrUse(Store, Store->getPointerOperand());
4500         evaluatePtrUse(Store, Store->getValueOperand());
4501       }
4502     }
4503   for (auto *I : ScalarPtrs)
4504     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4505       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4506       Worklist.insert(I);
4507     }
4508 
4509   // (3) Add to the worklist all pointer induction variables and their update
4510   // instructions.
4511   //
4512   // TODO: Once we are able to vectorize pointer induction variables we should
4513   //       no longer insert them into the worklist here.
4514   auto *Latch = TheLoop->getLoopLatch();
4515   for (auto &Induction : *Legal->getInductionVars()) {
4516     auto *Ind = Induction.first;
4517     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4518     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4519       continue;
4520     Worklist.insert(Ind);
4521     Worklist.insert(IndUpdate);
4522     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4523     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4524                       << "\n");
4525   }
4526 
4527   // Insert the forced scalars.
4528   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4529   // induction variable when the PHI user is scalarized.
4530   auto ForcedScalar = ForcedScalars.find(VF);
4531   if (ForcedScalar != ForcedScalars.end())
4532     for (auto *I : ForcedScalar->second)
4533       Worklist.insert(I);
4534 
4535   // Expand the worklist by looking through any bitcasts and getelementptr
4536   // instructions we've already identified as scalar. This is similar to the
4537   // expansion step in collectLoopUniforms(); however, here we're only
4538   // expanding to include additional bitcasts and getelementptr instructions.
4539   unsigned Idx = 0;
4540   while (Idx != Worklist.size()) {
4541     Instruction *Dst = Worklist[Idx++];
4542     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4543       continue;
4544     auto *Src = cast<Instruction>(Dst->getOperand(0));
4545     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4546           auto *J = cast<Instruction>(U);
4547           return !TheLoop->contains(J) || Worklist.count(J) ||
4548                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4549                   isScalarUse(J, Src));
4550         })) {
4551       Worklist.insert(Src);
4552       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4553     }
4554   }
4555 
4556   // An induction variable will remain scalar if all users of the induction
4557   // variable and induction variable update remain scalar.
4558   for (auto &Induction : *Legal->getInductionVars()) {
4559     auto *Ind = Induction.first;
4560     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4561 
4562     // We already considered pointer induction variables, so there's no reason
4563     // to look at their users again.
4564     //
4565     // TODO: Once we are able to vectorize pointer induction variables we
4566     //       should no longer skip over them here.
4567     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4568       continue;
4569 
4570     // Determine if all users of the induction variable are scalar after
4571     // vectorization.
4572     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4573       auto *I = cast<Instruction>(U);
4574       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4575     });
4576     if (!ScalarInd)
4577       continue;
4578 
4579     // Determine if all users of the induction variable update instruction are
4580     // scalar after vectorization.
4581     auto ScalarIndUpdate =
4582         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4583           auto *I = cast<Instruction>(U);
4584           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4585         });
4586     if (!ScalarIndUpdate)
4587       continue;
4588 
4589     // The induction variable and its update instruction will remain scalar.
4590     Worklist.insert(Ind);
4591     Worklist.insert(IndUpdate);
4592     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4593     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4594                       << "\n");
4595   }
4596 
4597   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4598 }
4599 
4600 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4601   if (!blockNeedsPredication(I->getParent()))
4602     return false;
4603   switch(I->getOpcode()) {
4604   default:
4605     break;
4606   case Instruction::Load:
4607   case Instruction::Store: {
4608     if (!Legal->isMaskRequired(I))
4609       return false;
4610     auto *Ptr = getLoadStorePointerOperand(I);
4611     auto *Ty = getMemInstValueType(I);
4612     // We have already decided how to vectorize this instruction, get that
4613     // result.
4614     if (VF > 1) {
4615       InstWidening WideningDecision = getWideningDecision(I, VF);
4616       assert(WideningDecision != CM_Unknown &&
4617              "Widening decision should be ready at this moment");
4618       return WideningDecision == CM_Scalarize;
4619     }
4620     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4621     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4622                                 isLegalMaskedGather(Ty, Alignment))
4623                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4624                                 isLegalMaskedScatter(Ty, Alignment));
4625   }
4626   case Instruction::UDiv:
4627   case Instruction::SDiv:
4628   case Instruction::SRem:
4629   case Instruction::URem:
4630     return mayDivideByZero(*I);
4631   }
4632   return false;
4633 }
4634 
4635 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4636                                                                unsigned VF) {
4637   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4638   assert(getWideningDecision(I, VF) == CM_Unknown &&
4639          "Decision should not be set yet.");
4640   auto *Group = getInterleavedAccessGroup(I);
4641   assert(Group && "Must have a group.");
4642 
4643   // If the instruction's allocated size doesn't equal its type size, it
4644   // requires padding and will be scalarized.
4645   auto &DL = I->getModule()->getDataLayout();
4646   auto *ScalarTy = getMemInstValueType(I);
4647   if (hasIrregularType(ScalarTy, DL, VF))
4648     return false;
4649 
4650   // Check if masking is required.
4651   // A Group may need masking for one of two reasons: it resides in a block that
4652   // needs predication, or it was decided to use masking to deal with gaps.
4653   bool PredicatedAccessRequiresMasking =
4654       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4655   bool AccessWithGapsRequiresMasking =
4656       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4657   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4658     return true;
4659 
4660   // If masked interleaving is required, we expect that the user/target had
4661   // enabled it, because otherwise it either wouldn't have been created or
4662   // it should have been invalidated by the CostModel.
4663   assert(useMaskedInterleavedAccesses(TTI) &&
4664          "Masked interleave-groups for predicated accesses are not enabled.");
4665 
4666   auto *Ty = getMemInstValueType(I);
4667   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4668   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4669                           : TTI.isLegalMaskedStore(Ty, Alignment);
4670 }
4671 
4672 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4673                                                                unsigned VF) {
4674   // Get and ensure we have a valid memory instruction.
4675   LoadInst *LI = dyn_cast<LoadInst>(I);
4676   StoreInst *SI = dyn_cast<StoreInst>(I);
4677   assert((LI || SI) && "Invalid memory instruction");
4678 
4679   auto *Ptr = getLoadStorePointerOperand(I);
4680 
4681   // In order to be widened, the pointer should be consecutive, first of all.
4682   if (!Legal->isConsecutivePtr(Ptr))
4683     return false;
4684 
4685   // If the instruction is a store located in a predicated block, it will be
4686   // scalarized.
4687   if (isScalarWithPredication(I))
4688     return false;
4689 
4690   // If the instruction's allocated size doesn't equal its type size, it
4691   // requires padding and will be scalarized.
4692   auto &DL = I->getModule()->getDataLayout();
4693   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4694   if (hasIrregularType(ScalarTy, DL, VF))
4695     return false;
4696 
4697   return true;
4698 }
4699 
4700 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4701   // We should not collect Uniforms more than once per VF. Right now,
4702   // this function is called from collectUniformsAndScalars(), which
4703   // already does this check. Collecting Uniforms for VF=1 does not make any
4704   // sense.
4705 
4706   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4707          "This function should not be visited twice for the same VF");
4708 
4709   // Visit the list of Uniforms. If we don't find any uniform value, we won't
4710   // analyze it again: Uniforms.count(VF) will return 1.
4711   Uniforms[VF].clear();
4712 
4713   // We now know that the loop is vectorizable!
4714   // Collect instructions inside the loop that will remain uniform after
4715   // vectorization.
4716 
4717   // Global values, params and instructions outside of current loop are out of
4718   // scope.
4719   auto isOutOfScope = [&](Value *V) -> bool {
4720     Instruction *I = dyn_cast<Instruction>(V);
4721     return (!I || !TheLoop->contains(I));
4722   };
4723 
4724   SetVector<Instruction *> Worklist;
4725   BasicBlock *Latch = TheLoop->getLoopLatch();
4726 
4727   // Instructions that are scalar with predication must not be considered
4728   // uniform after vectorization, because that would create an erroneous
4729   // replicating region where only a single instance out of VF should be formed.
4730   // TODO: optimize such seldom cases if found important, see PR40816.
4731   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4732     if (isScalarWithPredication(I, VF)) {
4733       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4734                         << *I << "\n");
4735       return;
4736     }
4737     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4738     Worklist.insert(I);
4739   };
4740 
4741   // Start with the conditional branch. If the branch condition is an
4742   // instruction contained in the loop that is only used by the branch, it is
4743   // uniform.
4744   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4745   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4746     addToWorklistIfAllowed(Cmp);
4747 
4748   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4749   // are pointers that are treated like consecutive pointers during
4750   // vectorization. The pointer operands of interleaved accesses are an
4751   // example.
4752   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4753 
4754   // Holds pointer operands of instructions that are possibly non-uniform.
4755   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4756 
4757   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4758     InstWidening WideningDecision = getWideningDecision(I, VF);
4759     assert(WideningDecision != CM_Unknown &&
4760            "Widening decision should be ready at this moment");
4761 
4762     return (WideningDecision == CM_Widen ||
4763             WideningDecision == CM_Widen_Reverse ||
4764             WideningDecision == CM_Interleave);
4765   };
4766   // Iterate over the instructions in the loop, and collect all
4767   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4768   // that a consecutive-like pointer operand will be scalarized, we collect it
4769   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4770   // getelementptr instruction can be used by both vectorized and scalarized
4771   // memory instructions. For example, if a loop loads and stores from the same
4772   // location, but the store is conditional, the store will be scalarized, and
4773   // the getelementptr won't remain uniform.
4774   for (auto *BB : TheLoop->blocks())
4775     for (auto &I : *BB) {
4776       // If there's no pointer operand, there's nothing to do.
4777       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4778       if (!Ptr)
4779         continue;
4780 
4781       // True if all users of Ptr are memory accesses that have Ptr as their
4782       // pointer operand.
4783       auto UsersAreMemAccesses =
4784           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4785             return getLoadStorePointerOperand(U) == Ptr;
4786           });
4787 
4788       // Ensure the memory instruction will not be scalarized or used by
4789       // gather/scatter, making its pointer operand non-uniform. If the pointer
4790       // operand is used by any instruction other than a memory access, we
4791       // conservatively assume the pointer operand may be non-uniform.
4792       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4793         PossibleNonUniformPtrs.insert(Ptr);
4794 
4795       // If the memory instruction will be vectorized and its pointer operand
4796       // is consecutive-like, or interleaving - the pointer operand should
4797       // remain uniform.
4798       else
4799         ConsecutiveLikePtrs.insert(Ptr);
4800     }
4801 
4802   // Add to the Worklist all consecutive and consecutive-like pointers that
4803   // aren't also identified as possibly non-uniform.
4804   for (auto *V : ConsecutiveLikePtrs)
4805     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4806       addToWorklistIfAllowed(V);
4807 
4808   // Expand Worklist in topological order: whenever a new instruction
4809   // is added, its users should already be inside Worklist. This ensures
4810   // that a uniform instruction will only be used by uniform instructions.
4811   unsigned idx = 0;
4812   while (idx != Worklist.size()) {
4813     Instruction *I = Worklist[idx++];
4814 
4815     for (auto OV : I->operand_values()) {
4816       // isOutOfScope operands cannot be uniform instructions.
4817       if (isOutOfScope(OV))
4818         continue;
4819       // First order recurrence Phi's should typically be considered
4820       // non-uniform.
4821       auto *OP = dyn_cast<PHINode>(OV);
4822       if (OP && Legal->isFirstOrderRecurrence(OP))
4823         continue;
4824       // If all the users of the operand are uniform, then add the
4825       // operand into the uniform worklist.
4826       auto *OI = cast<Instruction>(OV);
4827       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4828             auto *J = cast<Instruction>(U);
4829             return Worklist.count(J) ||
4830                    (OI == getLoadStorePointerOperand(J) &&
4831                     isUniformDecision(J, VF));
4832           }))
4833         addToWorklistIfAllowed(OI);
4834     }
4835   }
4836 
4837   // Returns true if Ptr is the pointer operand of a memory access instruction
4838   // I, and I is known to not require scalarization.
4839   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4840     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4841   };
4842 
4843   // For an instruction to be added into Worklist above, all its users inside
4844   // the loop should also be in Worklist. However, this condition cannot be
4845   // true for phi nodes that form a cyclic dependence. We must process phi
4846   // nodes separately. An induction variable will remain uniform if all users
4847   // of the induction variable and induction variable update remain uniform.
4848   // The code below handles both pointer and non-pointer induction variables.
4849   for (auto &Induction : *Legal->getInductionVars()) {
4850     auto *Ind = Induction.first;
4851     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4852 
4853     // Determine if all users of the induction variable are uniform after
4854     // vectorization.
4855     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4856       auto *I = cast<Instruction>(U);
4857       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4858              isVectorizedMemAccessUse(I, Ind);
4859     });
4860     if (!UniformInd)
4861       continue;
4862 
4863     // Determine if all users of the induction variable update instruction are
4864     // uniform after vectorization.
4865     auto UniformIndUpdate =
4866         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4867           auto *I = cast<Instruction>(U);
4868           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4869                  isVectorizedMemAccessUse(I, IndUpdate);
4870         });
4871     if (!UniformIndUpdate)
4872       continue;
4873 
4874     // The induction variable and its update instruction will remain uniform.
4875     addToWorklistIfAllowed(Ind);
4876     addToWorklistIfAllowed(IndUpdate);
4877   }
4878 
4879   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4880 }
4881 
4882 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4883   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4884 
4885   if (Legal->getRuntimePointerChecking()->Need) {
4886     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4887         "runtime pointer checks needed. Enable vectorization of this "
4888         "loop with '#pragma clang loop vectorize(enable)' when "
4889         "compiling with -Os/-Oz",
4890         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4891     return true;
4892   }
4893 
4894   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4895     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4896         "runtime SCEV checks needed. Enable vectorization of this "
4897         "loop with '#pragma clang loop vectorize(enable)' when "
4898         "compiling with -Os/-Oz",
4899         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4900     return true;
4901   }
4902 
4903   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4904   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4905     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4906         "runtime stride == 1 checks needed. Enable vectorization of "
4907         "this loop with '#pragma clang loop vectorize(enable)' when "
4908         "compiling with -Os/-Oz",
4909         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4910     return true;
4911   }
4912 
4913   return false;
4914 }
4915 
4916 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4917   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since it's still likely to be
    // dynamically uniform if the target can skip it.
4920     reportVectorizationFailure(
4921         "Not inserting runtime ptr check for divergent target",
4922         "runtime pointer checks needed. Not enabled for divergent target",
4923         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4924     return None;
4925   }
4926 
4927   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4928   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4929   if (TC == 1) {
4930     reportVectorizationFailure("Single iteration (non) loop",
4931         "loop trip count is one, irrelevant for vectorization",
4932         "SingleIterationLoop", ORE, TheLoop);
4933     return None;
4934   }
4935 
4936   switch (ScalarEpilogueStatus) {
4937   case CM_ScalarEpilogueAllowed:
4938     return computeFeasibleMaxVF(TC);
4939   case CM_ScalarEpilogueNotNeededUsePredicate:
4940     LLVM_DEBUG(
4941         dbgs() << "LV: vector predicate hint/switch found.\n"
4942                << "LV: Not allowing scalar epilogue, creating predicated "
4943                << "vector loop.\n");
4944     break;
4945   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4946     // fallthrough as a special case of OptForSize
4947   case CM_ScalarEpilogueNotAllowedOptSize:
4948     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4949       LLVM_DEBUG(
4950           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4951     else
4952       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4953                         << "count.\n");
4954 
    // Bail out if runtime checks are required; they are not good when
    // optimizing for size.
4957     if (runtimeChecksRequired())
4958       return None;
4959     break;
4960   }
4961 
  // Now try tail folding.
4963 
4964   // Invalidate interleave groups that require an epilogue if we can't mask
4965   // the interleave-group.
4966   if (!useMaskedInterleavedAccesses(TTI))
4967     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4968 
4969   unsigned MaxVF = computeFeasibleMaxVF(TC);
4970   if (TC > 0 && TC % MaxVF == 0) {
4971     // Accept MaxVF if we do not have a tail.
4972     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4973     return MaxVF;
4974   }
4975 
4976   // If we don't know the precise trip count, or if the trip count that we
4977   // found modulo the vectorization factor is not zero, try to fold the tail
4978   // by masking.
4979   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4980   if (Legal->prepareToFoldTailByMasking()) {
4981     FoldTailByMasking = true;
4982     return MaxVF;
4983   }
4984 
4985   if (TC == 0) {
4986     reportVectorizationFailure(
4987         "Unable to calculate the loop count due to complex control flow",
4988         "unable to calculate the loop count due to complex control flow",
4989         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4990     return None;
4991   }
4992 
4993   reportVectorizationFailure(
4994       "Cannot optimize for size and vectorize at the same time.",
4995       "cannot optimize for size and vectorize at the same time. "
4996       "Enable vectorization of this loop with '#pragma clang loop "
4997       "vectorize(enable)' when compiling with -Os/-Oz",
4998       "NoTailLoopWithOptForSize", ORE, TheLoop);
4999   return None;
5000 }
5001 
5002 unsigned
5003 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5004   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5005   unsigned SmallestType, WidestType;
5006   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5007   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5008 
5009   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
5013   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5014 
5015   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5016 
5017   unsigned MaxVectorSize = WidestRegister / WidestType;
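  // A hedged worked example (hypothetical target values): with a 256-bit
  // widest register and a widest in-loop type of 32 bits, the estimate is
  //   MaxVectorSize = 256 / 32;  // == 8 lanes
  // and a smaller MaxSafeRegisterWidth from LAA would shrink this further.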
5018 
5019   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5020                     << " / " << WidestType << " bits.\n");
5021   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5022                     << WidestRegister << " bits.\n");
5023 
5024   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5025                                  " into one vector!");
5026   if (MaxVectorSize == 0) {
5027     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5028     MaxVectorSize = 1;
5029     return MaxVectorSize;
5030   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5031              isPowerOf2_32(ConstTripCount)) {
5032     // We need to clamp the VF to be the ConstTripCount. There is no point in
5033     // choosing a higher viable VF as done in the loop below.
5034     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5035                       << ConstTripCount << "\n");
5036     MaxVectorSize = ConstTripCount;
5037     return MaxVectorSize;
5038   }
5039 
5040   unsigned MaxVF = MaxVectorSize;
5041   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5042       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5043     // Collect all viable vectorization factors larger than the default MaxVF
5044     // (i.e. MaxVectorSize).
5045     SmallVector<unsigned, 8> VFs;
5046     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5047     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5048       VFs.push_back(VS);
5049 
5050     // For each VF calculate its register usage.
5051     auto RUs = calculateRegisterUsage(VFs);
5052 
5053     // Select the largest VF which doesn't require more registers than existing
5054     // ones.
5055     for (int i = RUs.size() - 1; i >= 0; --i) {
5056       bool Selected = true;
5057       for (auto& pair : RUs[i].MaxLocalUsers) {
5058         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5059         if (pair.second > TargetNumRegisters)
5060           Selected = false;
5061       }
5062       if (Selected) {
5063         MaxVF = VFs[i];
5064         break;
5065       }
5066     }
5067     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5068       if (MaxVF < MinVF) {
5069         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5070                           << ") with target's minimum: " << MinVF << '\n');
5071         MaxVF = MinVF;
5072       }
5073     }
5074   }
5075   return MaxVF;
5076 }
5077 
5078 VectorizationFactor
5079 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5080   float Cost = expectedCost(1).first;
5081   const float ScalarCost = Cost;
5082   unsigned Width = 1;
5083   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5084 
5085   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5086   if (ForceVectorization && MaxVF > 1) {
5087     // Ignore scalar width, because the user explicitly wants vectorization.
5088     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5089     // evaluation.
5090     Cost = std::numeric_limits<float>::max();
5091   }
5092 
5093   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
5097     VectorizationCostTy C = expectedCost(i);
5098     float VectorCost = C.first / (float)i;
5099     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5100                       << " costs: " << (int)VectorCost << ".\n");
5101     if (!C.second && !ForceVectorization) {
5102       LLVM_DEBUG(
5103           dbgs() << "LV: Not considering vector loop of width " << i
5104                  << " because it will not generate any vector instructions.\n");
5105       continue;
5106     }
5107     if (VectorCost < Cost) {
5108       Cost = VectorCost;
5109       Width = i;
5110     }
5111   }
5112 
5113   if (!EnableCondStoresVectorization && NumPredStores) {
5114     reportVectorizationFailure("There are conditional stores.",
5115         "store that is conditionally executed prevents vectorization",
5116         "ConditionalStore", ORE, TheLoop);
5117     Width = 1;
5118     Cost = ScalarCost;
5119   }
5120 
5121   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5122              << "LV: Vectorization seems to be not beneficial, "
5123              << "but was forced by a user.\n");
5124   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5125   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5126   return Factor;
5127 }
5128 
5129 std::pair<unsigned, unsigned>
5130 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5131   unsigned MinWidth = -1U;
5132   unsigned MaxWidth = 8;
5133   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5134 
5135   // For each block.
5136   for (BasicBlock *BB : TheLoop->blocks()) {
5137     // For each instruction in the loop.
5138     for (Instruction &I : BB->instructionsWithoutDebug()) {
5139       Type *T = I.getType();
5140 
5141       // Skip ignored values.
5142       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5143         continue;
5144 
5145       // Only examine Loads, Stores and PHINodes.
5146       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5147         continue;
5148 
5149       // Examine PHI nodes that are reduction variables. Update the type to
5150       // account for the recurrence type.
5151       if (auto *PN = dyn_cast<PHINode>(&I)) {
5152         if (!Legal->isReductionVariable(PN))
5153           continue;
5154         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5155         T = RdxDesc.getRecurrenceType();
5156       }
5157 
5158       // Examine the stored values.
5159       if (auto *ST = dyn_cast<StoreInst>(&I))
5160         T = ST->getValueOperand()->getType();
5161 
5162       // Ignore loaded pointer types and stored pointer types that are not
5163       // vectorizable.
5164       //
5165       // FIXME: The check here attempts to predict whether a load or store will
5166       //        be vectorized. We only know this for certain after a VF has
5167       //        been selected. Here, we assume that if an access can be
5168       //        vectorized, it will be. We should also look at extending this
5169       //        optimization to non-pointer types.
5170       //
5171       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5172           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5173         continue;
5174 
5175       MinWidth = std::min(MinWidth,
5176                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5177       MaxWidth = std::max(MaxWidth,
5178                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5179     }
5180   }
5181 
5182   return {MinWidth, MaxWidth};
5183 }
5184 
5185 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5186                                                            unsigned LoopCost) {
5187   // -- The interleave heuristics --
5188   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5189   // There are many micro-architectural considerations that we can't predict
5190   // at this level. For example, frontend pressure (on decode or fetch) due to
5191   // code size, or the number and capabilities of the execution ports.
5192   //
5193   // We use the following heuristics to select the interleave count:
5194   // 1. If the code has reductions, then we interleave to break the cross
5195   // iteration dependency.
5196   // 2. If the loop is really small, then we interleave to reduce the loop
5197   // overhead.
5198   // 3. We don't interleave if we think that we will spill registers to memory
5199   // due to the increased register pressure.
5200 
5201   if (!isScalarEpilogueAllowed())
5202     return 1;
5203 
  // We already used the maximum safe dependence distance to limit the width,
  // so do not interleave.
5205   if (Legal->getMaxSafeDepDistBytes() != -1U)
5206     return 1;
5207 
5208   // Do not interleave loops with a relatively small known or estimated trip
5209   // count.
5210   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5211   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5212     return 1;
5213 
5214   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so assume that we have at least one
  // instruction that uses at least one register (this avoids dividing by
  // zero).
5217   for (auto& pair : R.MaxLocalUsers) {
5218     pair.second = std::max(pair.second, 1U);
5219   }
5220 
5221   // We calculate the interleave count using the following formula.
5222   // Subtract the number of loop invariants from the number of available
5223   // registers. These registers are used by all of the interleaved instances.
5224   // Next, divide the remaining registers by the number of registers that is
5225   // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case an
  // interleave count of one is returned above.
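  // A hedged worked example of the formula below (hypothetical register
  // counts, ignoring the induction-variable heuristic): with 16 registers in
  // a class, 2 of them holding loop invariants, and 3 maximum local users,
  //   IC = PowerOf2Floor((16 - 2) / 3);  // == PowerOf2Floor(4) == 4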
5232   unsigned IC = UINT_MAX;
5233 
5234   for (auto& pair : R.MaxLocalUsers) {
5235     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5236     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5237                       << " registers of "
5238                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5239     if (VF == 1) {
5240       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5241         TargetNumRegisters = ForceTargetNumScalarRegs;
5242     } else {
5243       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5244         TargetNumRegisters = ForceTargetNumVectorRegs;
5245     }
5246     unsigned MaxLocalUsers = pair.second;
5247     unsigned LoopInvariantRegs = 0;
5248     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5249       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5250 
5251     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5252     // Don't count the induction variable as interleaved.
5253     if (EnableIndVarRegisterHeur) {
5254       TmpIC =
5255           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5256                         std::max(1U, (MaxLocalUsers - 1)));
5257     }
5258 
5259     IC = std::min(IC, TmpIC);
5260   }
5261 
5262   // Clamp the interleave ranges to reasonable counts.
5263   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5264 
5265   // Check if the user has overridden the max.
5266   if (VF == 1) {
5267     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5268       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5269   } else {
5270     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5271       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5272   }
5273 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF.
5276   if (BestKnownTC) {
5277     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5278   }
5279 
5280   // If we did not calculate the cost for VF (because the user selected the VF)
5281   // then we calculate the cost of VF here.
5282   if (LoopCost == 0)
5283     LoopCost = expectedCost(VF).first;
5284 
5285   assert(LoopCost && "Non-zero loop cost expected");
5286 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5289   if (IC > MaxInterleaveCount)
5290     IC = MaxInterleaveCount;
5291   else if (IC < 1)
5292     IC = 1;
5293 
5294   // Interleave if we vectorized this loop and there is a reduction that could
5295   // benefit from interleaving.
5296   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5297     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5298     return IC;
5299   }
5300 
5301   // Note that if we've already vectorized the loop we will have done the
5302   // runtime check and so interleaving won't require further checks.
5303   bool InterleavingRequiresRuntimePointerCheck =
5304       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5305 
5306   // We want to interleave small loops in order to reduce the loop overhead and
5307   // potentially expose ILP opportunities.
5308   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5309   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead cost is 1. We use the cost model to
    // estimate the cost of the loop body and interleave until the loop
    // overhead is about 5% of the cost of the loop.
5313     unsigned SmallIC =
5314         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
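    // Hypothetical illustration: assuming SmallLoopCost == 20 and a LoopCost
    // of 6, PowerOf2Floor(20 / 6) == PowerOf2Floor(3) == 2, so SmallIC is at
    // most 2 here.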
5315 
5316     // Interleave until store/load ports (estimated by max interleave count) are
5317     // saturated.
5318     unsigned NumStores = Legal->getNumStores();
5319     unsigned NumLoads = Legal->getNumLoads();
5320     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5321     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5322 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit the interleave count,
    // by default to 2, so the critical path only gets increased by one
    // reduction operation.
5327     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5328       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5329       SmallIC = std::min(SmallIC, F);
5330       StoresIC = std::min(StoresIC, F);
5331       LoadsIC = std::min(LoadsIC, F);
5332     }
5333 
5334     if (EnableLoadStoreRuntimeInterleave &&
5335         std::max(StoresIC, LoadsIC) > SmallIC) {
5336       LLVM_DEBUG(
5337           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5338       return std::max(StoresIC, LoadsIC);
5339     }
5340 
5341     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5342     return SmallIC;
5343   }
5344 
5345   // Interleave if this is a large loop (small loops are already dealt with by
5346   // this point) that could benefit from interleaving.
5347   bool HasReductions = !Legal->getReductionVars()->empty();
5348   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5349     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5350     return IC;
5351   }
5352 
5353   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5354   return 1;
5355 }
5356 
5357 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5358 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users starts
  // an interval. We record every time that an in-loop value is used, so we
  // have a list of the first and last occurrences of each instruction. Next,
  // we transpose this data structure into a multi map that holds the list of
  // intervals that *end* at a specific location. This multi map allows us to
  // perform a linear search. We scan the instructions linearly and record each
  // time that a new interval starts, by placing it in a set. If we find this
  // value in the multi-map then we remove it from the set. The max register
  // usage is the maximum size of the set. We also search for instructions that
  // are defined outside the loop, but are used inside the loop. We need this
  // number separately from the max-interval usage number because, when we
  // unroll, loop-invariant values do not take more registers.
5376   LoopBlocksDFS DFS(TheLoop);
5377   DFS.perform(LI);
5378 
5379   RegisterUsage RU;
5380 
5381   // Each 'key' in the map opens a new interval. The values
5382   // of the map are the index of the 'last seen' usage of the
5383   // instruction that is the key.
5384   using IntervalMap = DenseMap<Instruction *, unsigned>;
5385 
  // Maps an index to the instruction at that position in RPO program order.
5387   SmallVector<Instruction *, 64> IdxToInstr;
5388   // Marks the end of each interval.
5389   IntervalMap EndPoint;
  // Saves the set of instructions that are used in the loop.
5391   SmallPtrSet<Instruction *, 8> Ends;
5392   // Saves the list of values that are used in the loop but are
5393   // defined outside the loop, such as arguments and constants.
5394   SmallPtrSet<Value *, 8> LoopInvariants;
5395 
5396   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5397     for (Instruction &I : BB->instructionsWithoutDebug()) {
5398       IdxToInstr.push_back(&I);
5399 
5400       // Save the end location of each USE.
5401       for (Value *U : I.operands()) {
5402         auto *Instr = dyn_cast<Instruction>(U);
5403 
5404         // Ignore non-instruction values such as arguments, constants, etc.
5405         if (!Instr)
5406           continue;
5407 
5408         // If this instruction is outside the loop then record it and continue.
5409         if (!TheLoop->contains(Instr)) {
5410           LoopInvariants.insert(Instr);
5411           continue;
5412         }
5413 
5414         // Overwrite previous end points.
5415         EndPoint[Instr] = IdxToInstr.size();
5416         Ends.insert(Instr);
5417       }
5418     }
5419   }
5420 
5421   // Saves the list of intervals that end with the index in 'key'.
5422   using InstrList = SmallVector<Instruction *, 2>;
5423   DenseMap<unsigned, InstrList> TransposeEnds;
5424 
5425   // Transpose the EndPoints to a list of values that end at each index.
5426   for (auto &Interval : EndPoint)
5427     TransposeEnds[Interval.second].push_back(Interval.first);
5428 
5429   SmallPtrSet<Instruction *, 8> OpenIntervals;
5430 
5431   // Get the size of the widest register.
5432   unsigned MaxSafeDepDist = -1U;
5433   if (Legal->getMaxSafeDepDistBytes() != -1U)
5434     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5435   unsigned WidestRegister =
5436       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5437   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5438 
5439   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5440   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5441 
5442   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5443 
5444   // A lambda that gets the register usage for the given type and VF.
5445   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5446     if (Ty->isTokenTy())
5447       return 0U;
5448     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5449     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5450   };
5451 
5452   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5453     Instruction *I = IdxToInstr[i];
5454 
5455     // Remove all of the instructions that end at this location.
5456     InstrList &List = TransposeEnds[i];
5457     for (Instruction *ToRemove : List)
5458       OpenIntervals.erase(ToRemove);
5459 
5460     // Ignore instructions that are never used within the loop.
5461     if (Ends.find(I) == Ends.end())
5462       continue;
5463 
5464     // Skip ignored values.
5465     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5466       continue;
5467 
5468     // For each VF find the maximum usage of registers.
5469     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5470       // Count the number of live intervals.
5471       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5472 
5473       if (VFs[j] == 1) {
5474         for (auto Inst : OpenIntervals) {
5475           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5476           if (RegUsage.find(ClassID) == RegUsage.end())
5477             RegUsage[ClassID] = 1;
5478           else
5479             RegUsage[ClassID] += 1;
5480         }
5481       } else {
5482         collectUniformsAndScalars(VFs[j]);
5483         for (auto Inst : OpenIntervals) {
5484           // Skip ignored values for VF > 1.
5485           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5486             continue;
5487           if (isScalarAfterVectorization(Inst, VFs[j])) {
5488             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5489             if (RegUsage.find(ClassID) == RegUsage.end())
5490               RegUsage[ClassID] = 1;
5491             else
5492               RegUsage[ClassID] += 1;
5493           } else {
5494             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5495             if (RegUsage.find(ClassID) == RegUsage.end())
5496               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5497             else
5498               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5499           }
5500         }
5501       }
5502 
5503       for (auto& pair : RegUsage) {
5504         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5505           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5506         else
5507           MaxUsages[j][pair.first] = pair.second;
5508       }
5509     }
5510 
5511     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5512                       << OpenIntervals.size() << '\n');
5513 
5514     // Add the current instruction to the list of open intervals.
5515     OpenIntervals.insert(I);
5516   }
5517 
5518   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5519     SmallMapVector<unsigned, unsigned, 4> Invariant;
5520 
5521     for (auto Inst : LoopInvariants) {
5522       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5523       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5524       if (Invariant.find(ClassID) == Invariant.end())
5525         Invariant[ClassID] = Usage;
5526       else
5527         Invariant[ClassID] += Usage;
5528     }
5529 
5530     LLVM_DEBUG({
5531       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5532       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5533              << " item\n";
5534       for (const auto &pair : MaxUsages[i]) {
5535         dbgs() << "LV(REG): RegisterClass: "
5536                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5537                << " registers\n";
5538       }
5539       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5540              << " item\n";
5541       for (const auto &pair : Invariant) {
5542         dbgs() << "LV(REG): RegisterClass: "
5543                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5544                << " registers\n";
5545       }
5546     });
5547 
5548     RU.LoopInvariantRegs = Invariant;
5549     RU.MaxLocalUsers = MaxUsages[i];
5550     RUs[i] = RU;
5551   }
5552 
5553   return RUs;
5554 }
5555 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5557   // TODO: Cost model for emulated masked load/store is completely
5558   // broken. This hack guides the cost model to use an artificially
5559   // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // A limited amount of Masked Store/Scatter emulation was allowed.
5565   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5566   return isa<LoadInst>(I) ||
5567          (isa<StoreInst>(I) &&
5568           NumPredStores > NumberOfStoresToPredicate);
5569 }
5570 
5571 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5572   // If we aren't vectorizing the loop, or if we've already collected the
5573   // instructions to scalarize, there's nothing to do. Collection may already
5574   // have occurred if we have a user-selected VF and are now computing the
5575   // expected cost for interleaving.
5576   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5577     return;
5578 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5580   // not profitable to scalarize any instructions, the presence of VF in the
5581   // map will indicate that we've analyzed it already.
5582   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5583 
5584   // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better not to if-convert the blocks they are in.
5586   // If so, we also record the instructions to scalarize.
5587   for (BasicBlock *BB : TheLoop->blocks()) {
5588     if (!blockNeedsPredication(BB))
5589       continue;
5590     for (Instruction &I : *BB)
5591       if (isScalarWithPredication(&I)) {
5592         ScalarCostsTy ScalarCosts;
5593         // Do not apply discount logic if hacked cost is needed
5594         // for emulated masked memrefs.
5595         if (!useEmulatedMaskMemRefHack(&I) &&
5596             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5597           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5598         // Remember that BB will remain after vectorization.
5599         PredicatedBBsAfterVectorization.insert(BB);
5600       }
5601   }
5602 }
5603 
5604 int LoopVectorizationCostModel::computePredInstDiscount(
5605     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5606     unsigned VF) {
5607   assert(!isUniformAfterVectorization(PredInst, VF) &&
5608          "Instruction marked uniform-after-vectorization will be predicated");
5609 
5610   // Initialize the discount to zero, meaning that the scalar version and the
5611   // vector version cost the same.
5612   int Discount = 0;
5613 
5614   // Holds instructions to analyze. The instructions we visit are mapped in
5615   // ScalarCosts. Those instructions are the ones that would be scalarized if
5616   // we find that the scalar version costs less.
5617   SmallVector<Instruction *, 8> Worklist;
5618 
5619   // Returns true if the given instruction can be scalarized.
5620   auto canBeScalarized = [&](Instruction *I) -> bool {
5621     // We only attempt to scalarize instructions forming a single-use chain
5622     // from the original predicated block that would otherwise be vectorized.
5623     // Although not strictly necessary, we give up on instructions we know will
5624     // already be scalar to avoid traversing chains that are unlikely to be
5625     // beneficial.
5626     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5627         isScalarAfterVectorization(I, VF))
5628       return false;
5629 
5630     // If the instruction is scalar with predication, it will be analyzed
5631     // separately. We ignore it within the context of PredInst.
5632     if (isScalarWithPredication(I))
5633       return false;
5634 
5635     // If any of the instruction's operands are uniform after vectorization,
5636     // the instruction cannot be scalarized. This prevents, for example, a
5637     // masked load from being scalarized.
5638     //
5639     // We assume we will only emit a value for lane zero of an instruction
5640     // marked uniform after vectorization, rather than VF identical values.
5641     // Thus, if we scalarize an instruction that uses a uniform, we would
5642     // create uses of values corresponding to the lanes we aren't emitting code
5643     // for. This behavior can be changed by allowing getScalarValue to clone
5644     // the lane zero values for uniforms rather than asserting.
5645     for (Use &U : I->operands())
5646       if (auto *J = dyn_cast<Instruction>(U.get()))
5647         if (isUniformAfterVectorization(J, VF))
5648           return false;
5649 
5650     // Otherwise, we can scalarize the instruction.
5651     return true;
5652   };
5653 
5654   // Compute the expected cost discount from scalarizing the entire expression
5655   // feeding the predicated instruction. We currently only consider expressions
5656   // that are single-use instruction chains.
5657   Worklist.push_back(PredInst);
5658   while (!Worklist.empty()) {
5659     Instruction *I = Worklist.pop_back_val();
5660 
5661     // If we've already analyzed the instruction, there's nothing to do.
5662     if (ScalarCosts.find(I) != ScalarCosts.end())
5663       continue;
5664 
5665     // Compute the cost of the vector instruction. Note that this cost already
5666     // includes the scalarization overhead of the predicated instruction.
5667     unsigned VectorCost = getInstructionCost(I, VF).first;
5668 
5669     // Compute the cost of the scalarized instruction. This cost is the cost of
5670     // the instruction as if it wasn't if-converted and instead remained in the
5671     // predicated block. We will scale this cost by block probability after
5672     // computing the scalarization overhead.
5673     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5674 
5675     // Compute the scalarization overhead of needed insertelement instructions
5676     // and phi nodes.
5677     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5678       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5679                                                  true, false);
5680       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5681     }
5682 
5683     // Compute the scalarization overhead of needed extractelement
5684     // instructions. For each of the instruction's operands, if the operand can
5685     // be scalarized, add it to the worklist; otherwise, account for the
5686     // overhead.
5687     for (Use &U : I->operands())
5688       if (auto *J = dyn_cast<Instruction>(U.get())) {
5689         assert(VectorType::isValidElementType(J->getType()) &&
5690                "Instruction has non-scalar type");
5691         if (canBeScalarized(J))
5692           Worklist.push_back(J);
5693         else if (needsExtract(J, VF))
5694           ScalarCost += TTI.getScalarizationOverhead(
5695                               ToVectorTy(J->getType(),VF), false, true);
5696       }
5697 
5698     // Scale the total scalar cost by block probability.
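    // Illustration, assuming getReciprocalPredBlockProb() returns 2 (i.e. the
    // predicated block is expected to execute about half the time): a
    // ScalarCost of 40 becomes 40 / 2 == 20 after scaling.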
5699     ScalarCost /= getReciprocalPredBlockProb();
5700 
5701     // Compute the discount. A non-negative discount means the vector version
5702     // of the instruction costs more, and scalarizing would be beneficial.
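    // Hypothetical example: if VectorCost == 10 and the scaled ScalarCost ==
    // 6, the discount grows by 4, i.e. scalarizing this chain is estimated to
    // save 4 units of cost.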
5703     Discount += VectorCost - ScalarCost;
5704     ScalarCosts[I] = ScalarCost;
5705   }
5706 
5707   return Discount;
5708 }
5709 
5710 LoopVectorizationCostModel::VectorizationCostTy
5711 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5712   VectorizationCostTy Cost;
5713 
5714   // For each block.
5715   for (BasicBlock *BB : TheLoop->blocks()) {
5716     VectorizationCostTy BlockCost;
5717 
5718     // For each instruction in the old loop.
5719     for (Instruction &I : BB->instructionsWithoutDebug()) {
5720       // Skip ignored values.
5721       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5722           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5723         continue;
5724 
5725       VectorizationCostTy C = getInstructionCost(&I, VF);
5726 
5727       // Check if we should override the cost.
5728       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5729         C.first = ForceTargetInstructionCost;
5730 
5731       BlockCost.first += C.first;
5732       BlockCost.second |= C.second;
5733       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5734                         << " for VF " << VF << " For instruction: " << I
5735                         << '\n');
5736     }
5737 
5738     // If we are vectorizing a predicated block, it will have been
5739     // if-converted. This means that the block's instructions (aside from
5740     // stores and instructions that may divide by zero) will now be
5741     // unconditionally executed. For the scalar case, we may not always execute
5742     // the predicated block. Thus, scale the block's cost by the probability of
5743     // executing it.
5744     if (VF == 1 && blockNeedsPredication(BB))
5745       BlockCost.first /= getReciprocalPredBlockProb();
5746 
5747     Cost.first += BlockCost.first;
5748     Cost.second |= BlockCost.second;
5749   }
5750 
5751   return Cost;
5752 }
5753 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
5756 ///
5757 /// This SCEV can be sent to the Target in order to estimate the address
5758 /// calculation cost.
5759 static const SCEV *getAddressAccessSCEV(
5760               Value *Ptr,
5761               LoopVectorizationLegality *Legal,
5762               PredicatedScalarEvolution &PSE,
5763               const Loop *TheLoop) {
5764 
5765   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5766   if (!Gep)
5767     return nullptr;
5768 
5769   // We are looking for a gep with all loop invariant indices except for one
5770   // which should be an induction variable.
5771   auto SE = PSE.getSE();
5772   unsigned NumOperands = Gep->getNumOperands();
5773   for (unsigned i = 1; i < NumOperands; ++i) {
5774     Value *Opd = Gep->getOperand(i);
5775     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5776         !Legal->isInductionVariable(Opd))
5777       return nullptr;
5778   }
5779 
  // Now we know we have a GEP of the form ptr, %inv, %ind, %inv. Return the
  // Ptr SCEV.
5781   return PSE.getSCEV(Ptr);
5782 }
5783 
5784 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5785   return Legal->hasStride(I->getOperand(0)) ||
5786          Legal->hasStride(I->getOperand(1));
5787 }
5788 
5789 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5790                                                                  unsigned VF) {
5791   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5792   Type *ValTy = getMemInstValueType(I);
5793   auto SE = PSE.getSE();
5794 
5795   unsigned AS = getLoadStoreAddressSpace(I);
5796   Value *Ptr = getLoadStorePointerOperand(I);
5797   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5798 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5801   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5802 
5803   // Get the cost of the scalar memory instruction and address computation.
5804   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5805 
5806   // Don't pass *I here, since it is scalar but will actually be part of a
5807   // vectorized loop where the user of it is a vectorized instruction.
5808   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5809   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5810                                    Alignment, AS);
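  // Illustrative running total (hypothetical TTI costs): with VF == 4, an
  // address-computation cost of 1, and a scalar memory-op cost of 1, the cost
  // so far is 4 * 1 + 4 * 1 == 8, before scalarization overhead is added.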
5811 
5812   // Get the overhead of the extractelement and insertelement instructions
5813   // we might create due to scalarization.
5814   Cost += getScalarizationOverhead(I, VF);
5815 
5816   // If we have a predicated store, it may not be executed for each vector
5817   // lane. Scale the cost by the probability of executing the predicated
5818   // block.
5819   if (isPredicatedInst(I)) {
5820     Cost /= getReciprocalPredBlockProb();
5821 
5822     if (useEmulatedMaskMemRefHack(I))
5823       // Artificially setting to a high enough value to practically disable
5824       // vectorization with such operations.
5825       Cost = 3000000;
5826   }
5827 
5828   return Cost;
5829 }
5830 
5831 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5832                                                              unsigned VF) {
5833   Type *ValTy = getMemInstValueType(I);
5834   Type *VectorTy = ToVectorTy(ValTy, VF);
5835   Value *Ptr = getLoadStorePointerOperand(I);
5836   unsigned AS = getLoadStoreAddressSpace(I);
5837   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5838 
5839   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5840          "Stride should be 1 or -1 for consecutive memory access");
5841   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5842   unsigned Cost = 0;
5843   if (Legal->isMaskRequired(I))
5844     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5845                                       Alignment ? Alignment->value() : 0, AS);
5846   else
5847     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5848 
5849   bool Reverse = ConsecutiveStride < 0;
5850   if (Reverse)
5851     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5852   return Cost;
5853 }
5854 
5855 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5856                                                          unsigned VF) {
5857   Type *ValTy = getMemInstValueType(I);
5858   Type *VectorTy = ToVectorTy(ValTy, VF);
5859   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5860   unsigned AS = getLoadStoreAddressSpace(I);
5861   if (isa<LoadInst>(I)) {
5862     return TTI.getAddressComputationCost(ValTy) +
5863            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5864            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5865   }
5866   StoreInst *SI = cast<StoreInst>(I);
5867 
5868   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
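  // Hedged illustration of the formula below (hypothetical TTI costs): for a
  // store of a loop-variant value with VF == 4, the cost is the address
  // computation plus a scalar store plus an extract of lane VF - 1 (lane 3);
  // a loop-invariant stored value drops the extract term.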
5869   return TTI.getAddressComputationCost(ValTy) +
5870          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5871          (isLoopInvariantStoreValue
5872               ? 0
5873               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5874                                        VF - 1));
5875 }
5876 
5877 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5878                                                           unsigned VF) {
5879   Type *ValTy = getMemInstValueType(I);
5880   Type *VectorTy = ToVectorTy(ValTy, VF);
5881   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5882   Value *Ptr = getLoadStorePointerOperand(I);
5883 
5884   return TTI.getAddressComputationCost(VectorTy) +
5885          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5886                                     Legal->isMaskRequired(I),
5887                                     Alignment ? Alignment->value() : 0);
5888 }
5889 
5890 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5891                                                             unsigned VF) {
5892   Type *ValTy = getMemInstValueType(I);
5893   Type *VectorTy = ToVectorTy(ValTy, VF);
5894   unsigned AS = getLoadStoreAddressSpace(I);
5895 
5896   auto Group = getInterleavedAccessGroup(I);
5897   assert(Group && "Fail to get an interleaved access group.");
5898 
5899   unsigned InterleaveFactor = Group->getFactor();
5900   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5901 
5902   // Holds the indices of existing members in an interleaved load group.
5903   // An interleaved store group doesn't need this as it doesn't allow gaps.
5904   SmallVector<unsigned, 4> Indices;
5905   if (isa<LoadInst>(I)) {
5906     for (unsigned i = 0; i < InterleaveFactor; i++)
5907       if (Group->getMember(i))
5908         Indices.push_back(i);
5909   }
5910 
5911   // Calculate the cost of the whole interleaved group.
5912   bool UseMaskForGaps =
5913       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5914   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5915       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5916       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5917 
5918   if (Group->isReverse()) {
5919     // TODO: Add support for reversed masked interleaved access.
5920     assert(!Legal->isMaskRequired(I) &&
5921            "Reverse masked interleaved access not supported.");
5922     Cost += Group->getNumMembers() *
5923             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5924   }
5925   return Cost;
5926 }
5927 
5928 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5929                                                               unsigned VF) {
5930   // Calculate scalar cost only. Vectorization cost should be ready at this
5931   // moment.
5932   if (VF == 1) {
5933     Type *ValTy = getMemInstValueType(I);
5934     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5935     unsigned AS = getLoadStoreAddressSpace(I);
5936 
5937     return TTI.getAddressComputationCost(ValTy) +
5938            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5939   }
5940   return getWideningCost(I, VF);
5941 }
5942 
5943 LoopVectorizationCostModel::VectorizationCostTy
5944 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5945   // If we know that this instruction will remain uniform, check the cost of
5946   // the scalar version.
5947   if (isUniformAfterVectorization(I, VF))
5948     VF = 1;
5949 
5950   if (VF > 1 && isProfitableToScalarize(I, VF))
5951     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5952 
5953   // Forced scalars do not have any scalarization overhead.
5954   auto ForcedScalar = ForcedScalars.find(VF);
5955   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5956     auto InstSet = ForcedScalar->second;
5957     if (InstSet.find(I) != InstSet.end())
5958       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5959   }
5960 
5961   Type *VectorTy;
5962   unsigned C = getInstructionCost(I, VF, VectorTy);
5963 
5964   bool TypeNotScalarized =
5965       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
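  // Illustration (hypothetical target): with VF == 4 and a <4 x i32> VectorTy
  // that fits in a single register, getNumberOfParts(VectorTy) == 1 < 4, so
  // the type counts as not scalarized.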
5966   return VectorizationCostTy(C, TypeNotScalarized);
5967 }
5968 
5969 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5970                                                               unsigned VF) {
5971 
5972   if (VF == 1)
5973     return 0;
5974 
5975   unsigned Cost = 0;
5976   Type *RetTy = ToVectorTy(I->getType(), VF);
5977   if (!RetTy->isVoidTy() &&
5978       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5979     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5980 
5981   // Some targets keep addresses scalar.
5982   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5983     return Cost;
5984 
5985   // Some targets support efficient element stores.
5986   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5987     return Cost;
5988 
5989   // Collect operands to consider.
5990   CallInst *CI = dyn_cast<CallInst>(I);
5991   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5992 
5993   // Skip operands that do not require extraction/scalarization and do not incur
5994   // any overhead.
5995   return Cost + TTI.getOperandsScalarizationOverhead(
5996                     filterExtractingOperands(Ops, VF), VF);
5997 }
5998 
5999 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6000   if (VF == 1)
6001     return;
6002   NumPredStores = 0;
6003   for (BasicBlock *BB : TheLoop->blocks()) {
6004     // For each instruction in the old loop.
6005     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6007       if (!Ptr)
6008         continue;
6009 
6010       // TODO: We should generate better code and update the cost model for
6011       // predicated uniform stores. Today they are treated as any other
6012       // predicated store (see added test cases in
6013       // invariant-store-vectorization.ll).
6014       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6015         NumPredStores++;
6016 
6017       if (Legal->isUniform(Ptr) &&
6018           // Conditional loads and stores should be scalarized and predicated.
6019           // isScalarWithPredication cannot be used here since masked
6020           // gather/scatters are not considered scalar with predication.
6021           !Legal->blockNeedsPredication(I.getParent())) {
6022         // TODO: Avoid replicating loads and stores instead of
6023         // relying on instcombine to remove them.
6024         // Load: Scalar load + broadcast
6025         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6026         unsigned Cost = getUniformMemOpCost(&I, VF);
6027         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6028         continue;
6029       }
6030 
6031       // We assume that widening is the best solution when possible.
6032       if (memoryInstructionCanBeWidened(&I, VF)) {
6033         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6034         int ConsecutiveStride =
6035                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6036         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6037                "Expected consecutive stride.");
6038         InstWidening Decision =
6039             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6040         setWideningDecision(&I, VF, Decision, Cost);
6041         continue;
6042       }
6043 
6044       // Choose between Interleaving, Gather/Scatter or Scalarization.
6045       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6046       unsigned NumAccesses = 1;
6047       if (isAccessInterleaved(&I)) {
6048         auto Group = getInterleavedAccessGroup(&I);
6049         assert(Group && "Fail to get an interleaved access group.");
6050 
6051         // Make one decision for the whole group.
6052         if (getWideningDecision(&I, VF) != CM_Unknown)
6053           continue;
6054 
6055         NumAccesses = Group->getNumMembers();
6056         if (interleavedAccessCanBeWidened(&I, VF))
6057           InterleaveCost = getInterleaveGroupCost(&I, VF);
6058       }
6059 
6060       unsigned GatherScatterCost =
6061           isLegalGatherOrScatter(&I)
6062               ? getGatherScatterCost(&I, VF) * NumAccesses
6063               : std::numeric_limits<unsigned>::max();
6064 
6065       unsigned ScalarizationCost =
6066           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6067 
      // Choose the best solution for the current VF, record the decision, and
      // use it during vectorization.
6070       unsigned Cost;
6071       InstWidening Decision;
6072       if (InterleaveCost <= GatherScatterCost &&
6073           InterleaveCost < ScalarizationCost) {
6074         Decision = CM_Interleave;
6075         Cost = InterleaveCost;
6076       } else if (GatherScatterCost < ScalarizationCost) {
6077         Decision = CM_GatherScatter;
6078         Cost = GatherScatterCost;
6079       } else {
6080         Decision = CM_Scalarize;
6081         Cost = ScalarizationCost;
6082       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
6086       if (auto Group = getInterleavedAccessGroup(&I))
6087         setWideningDecision(Group, VF, Decision, Cost);
6088       else
6089         setWideningDecision(&I, VF, Decision, Cost);
6090     }
6091   }
6092 
6093   // Make sure that any load of address and any other address computation
6094   // remains scalar unless there is gather/scatter support. This avoids
6095   // inevitable extracts into address registers, and also has the benefit of
6096   // activating LSR more, since that pass can't optimize vectorized
6097   // addresses.
6098   if (TTI.prefersVectorizedAddressing())
6099     return;
6100 
6101   // Start with all scalar pointer uses.
6102   SmallPtrSet<Instruction *, 8> AddrDefs;
6103   for (BasicBlock *BB : TheLoop->blocks())
6104     for (Instruction &I : *BB) {
6105       Instruction *PtrDef =
6106         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6107       if (PtrDef && TheLoop->contains(PtrDef) &&
6108           getWideningDecision(&I, VF) != CM_GatherScatter)
6109         AddrDefs.insert(PtrDef);
6110     }
6111 
6112   // Add all instructions used to generate the addresses.
6113   SmallVector<Instruction *, 4> Worklist;
6114   for (auto *I : AddrDefs)
6115     Worklist.push_back(I);
6116   while (!Worklist.empty()) {
6117     Instruction *I = Worklist.pop_back_val();
6118     for (auto &Op : I->operands())
6119       if (auto *InstOp = dyn_cast<Instruction>(Op))
6120         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6121             AddrDefs.insert(InstOp).second)
6122           Worklist.push_back(InstOp);
6123   }
6124 
6125   for (auto *I : AddrDefs) {
6126     if (isa<LoadInst>(I)) {
6127       // Setting the desired widening decision should ideally be handled by
6128       // the cost functions, but since this involves finding out whether the
6129       // loaded register is involved in an address computation, it is instead
6130       // changed here when we know this is the case.
6131       InstWidening Decision = getWideningDecision(I, VF);
6132       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6133         // Scalarize a widened load of address.
6134         setWideningDecision(I, VF, CM_Scalarize,
6135                             (VF * getMemoryInstructionCost(I, 1)));
6136       else if (auto Group = getInterleavedAccessGroup(I)) {
6137         // Scalarize an interleave group of address loads.
6138         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6139           if (Instruction *Member = Group->getMember(I))
6140             setWideningDecision(Member, VF, CM_Scalarize,
6141                                 (VF * getMemoryInstructionCost(Member, 1)));
6142         }
6143       }
6144     } else
6145       // Make sure I gets scalarized and receives a cost estimate without
6146       // scalarization overhead.
6147       ForcedScalars[VF].insert(I);
6148   }
6149 }
6150 
6151 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6152                                                         unsigned VF,
6153                                                         Type *&VectorTy) {
6154   Type *RetTy = I->getType();
6155   if (canTruncateToMinimalBitwidth(I, VF))
6156     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6157   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6158   auto SE = PSE.getSE();
6159 
6160   // TODO: We need to estimate the cost of intrinsic calls.
6161   switch (I->getOpcode()) {
6162   case Instruction::GetElementPtr:
6163     // We mark this instruction as zero-cost because the cost of GEPs in
6164     // vectorized code depends on whether the corresponding memory instruction
6165     // is scalarized or not. Therefore, we handle GEPs with the memory
6166     // instruction cost.
6167     return 0;
6168   case Instruction::Br: {
6169     // In cases of scalarized and predicated instructions, there will be VF
6170     // predicated blocks in the vectorized loop. Each branch around these
6171     // blocks also requires an extract of its vector compare i1 element.
6172     bool ScalarPredicatedBB = false;
6173     BranchInst *BI = cast<BranchInst>(I);
6174     if (VF > 1 && BI->isConditional() &&
6175         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6176              PredicatedBBsAfterVectorization.end() ||
6177          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6178              PredicatedBBsAfterVectorization.end()))
6179       ScalarPredicatedBB = true;
6180 
6181     if (ScalarPredicatedBB) {
6182       // Return cost for branches around scalarized and predicated blocks.
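           // This is the cost of extracting the i1 element for each of the VF
           // predicated blocks, plus the cost of VF scalar branches.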
6183       Type *Vec_i1Ty =
6184           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6185       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6186               (TTI.getCFInstrCost(Instruction::Br) * VF));
6187     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6188       // The back-edge branch will remain, as will all scalar branches.
6189       return TTI.getCFInstrCost(Instruction::Br);
6190     else
6191       // This branch will be eliminated by if-conversion.
6192       return 0;
6193     // Note: We currently assume zero cost for an unconditional branch inside
6194     // a predicated block since it will become a fall-through, although we
6195     // may decide in the future to call TTI for all branches.
6196   }
6197   case Instruction::PHI: {
6198     auto *Phi = cast<PHINode>(I);
6199 
6200     // First-order recurrences are replaced by vector shuffles inside the loop.
6201     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6202     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6203       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6204                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6205 
6206     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6207     // converted into select instructions. We require N - 1 selects per phi
6208     // node, where N is the number of incoming values.
6209     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6210       return (Phi->getNumIncomingValues() - 1) *
6211              TTI.getCmpSelInstrCost(
6212                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6213                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6214 
6215     return TTI.getCFInstrCost(Instruction::PHI);
6216   }
6217   case Instruction::UDiv:
6218   case Instruction::SDiv:
6219   case Instruction::URem:
6220   case Instruction::SRem:
6221     // If we have a predicated instruction, it may not be executed for each
6222     // vector lane. Get the scalarization cost and scale this amount by the
6223     // probability of executing the predicated block. If the instruction is not
6224     // predicated, we fall through to the next case.
6225     if (VF > 1 && isScalarWithPredication(I)) {
6226       unsigned Cost = 0;
6227 
6228       // These instructions have a non-void type, so account for the phi nodes
6229       // that we will create. This cost is likely to be zero. The phi node
6230       // cost, if any, should be scaled by the block probability because it
6231       // models a copy at the end of each predicated block.
6232       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6233 
6234       // The cost of the non-predicated instruction.
6235       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6236 
6237       // The cost of insertelement and extractelement instructions needed for
6238       // scalarization.
6239       Cost += getScalarizationOverhead(I, VF);
6240 
6241       // Scale the cost by the probability of executing the predicated blocks.
6242       // This assumes the predicated block for each vector lane is equally
6243       // likely.
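           // E.g., assuming getReciprocalPredBlockProb() returns 2 (a 50% block
           // probability), the cost accumulated above is halved.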
6244       return Cost / getReciprocalPredBlockProb();
6245     }
6246     LLVM_FALLTHROUGH;
6247   case Instruction::Add:
6248   case Instruction::FAdd:
6249   case Instruction::Sub:
6250   case Instruction::FSub:
6251   case Instruction::Mul:
6252   case Instruction::FMul:
6253   case Instruction::FDiv:
6254   case Instruction::FRem:
6255   case Instruction::Shl:
6256   case Instruction::LShr:
6257   case Instruction::AShr:
6258   case Instruction::And:
6259   case Instruction::Or:
6260   case Instruction::Xor: {
6261     // Since we will replace the stride by 1, the multiplication should go away.
6262     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6263       return 0;
6264     // Certain instructions can be cheaper to vectorize if they have a constant
6265     // second vector operand. One example of this is shifts on x86.
6266     Value *Op2 = I->getOperand(1);
6267     TargetTransformInfo::OperandValueProperties Op2VP;
6268     TargetTransformInfo::OperandValueKind Op2VK =
6269         TTI.getOperandInfo(Op2, Op2VP);
6270     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6271       Op2VK = TargetTransformInfo::OK_UniformValue;
6272 
6273     SmallVector<const Value *, 4> Operands(I->operand_values());
6274     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6275     return N * TTI.getArithmeticInstrCost(
6276                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6277                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6278   }
6279   case Instruction::FNeg: {
6280     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6281     return N * TTI.getArithmeticInstrCost(
6282                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6283                    TargetTransformInfo::OK_AnyValue,
6284                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6285                    I->getOperand(0), I);
6286   }
6287   case Instruction::Select: {
6288     SelectInst *SI = cast<SelectInst>(I);
6289     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6290     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6291     Type *CondTy = SI->getCondition()->getType();
6292     if (!ScalarCond)
6293       CondTy = VectorType::get(CondTy, VF);
6294 
6295     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6296   }
6297   case Instruction::ICmp:
6298   case Instruction::FCmp: {
6299     Type *ValTy = I->getOperand(0)->getType();
6300     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6301     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6302       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6303     VectorTy = ToVectorTy(ValTy, VF);
6304     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6305   }
6306   case Instruction::Store:
6307   case Instruction::Load: {
6308     unsigned Width = VF;
6309     if (Width > 1) {
6310       InstWidening Decision = getWideningDecision(I, Width);
6311       assert(Decision != CM_Unknown &&
6312              "CM decision should be taken at this point");
6313       if (Decision == CM_Scalarize)
6314         Width = 1;
6315     }
6316     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6317     return getMemoryInstructionCost(I, VF);
6318   }
6319   case Instruction::ZExt:
6320   case Instruction::SExt:
6321   case Instruction::FPToUI:
6322   case Instruction::FPToSI:
6323   case Instruction::FPExt:
6324   case Instruction::PtrToInt:
6325   case Instruction::IntToPtr:
6326   case Instruction::SIToFP:
6327   case Instruction::UIToFP:
6328   case Instruction::Trunc:
6329   case Instruction::FPTrunc:
6330   case Instruction::BitCast: {
6331     // We optimize the truncation of induction variables having constant
6332     // integer steps. The cost of these truncations is the same as the scalar
6333     // operation.
6334     if (isOptimizableIVTruncate(I, VF)) {
6335       auto *Trunc = cast<TruncInst>(I);
6336       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6337                                   Trunc->getSrcTy(), Trunc);
6338     }
6339 
6340     Type *SrcScalarTy = I->getOperand(0)->getType();
6341     Type *SrcVecTy =
6342         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6343     if (canTruncateToMinimalBitwidth(I, VF)) {
6344       // This cast is going to be shrunk. This may remove the cast or it might
6345       // turn it into a slightly different cast. For example, if MinBW == 16,
6346       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6347       //
6348       // Calculate the modified src and dest types.
6349       Type *MinVecTy = VectorTy;
6350       if (I->getOpcode() == Instruction::Trunc) {
6351         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6352         VectorTy =
6353             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6354       } else if (I->getOpcode() == Instruction::ZExt ||
6355                  I->getOpcode() == Instruction::SExt) {
6356         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6357         VectorTy =
6358             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6359       }
6360     }
6361 
6362     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6363     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6364   }
6365   case Instruction::Call: {
6366     bool NeedToScalarize;
6367     CallInst *CI = cast<CallInst>(I);
6368     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6369     if (getVectorIntrinsicIDForCall(CI, TLI))
6370       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6371     return CallCost;
6372   }
6373   default:
6374     // The cost of executing VF copies of the scalar instruction. This opcode
6375     // is unknown. Assume that it is the same as 'mul'.
6376     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6377            getScalarizationOverhead(I, VF);
6378   } // end of switch.
6379 }
6380 
6381 char LoopVectorize::ID = 0;
6382 
6383 static const char lv_name[] = "Loop Vectorization";
6384 
6385 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6386 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6387 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6388 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6389 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6390 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6391 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6392 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6393 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6394 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6395 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6396 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6397 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6398 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6399 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6400 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6401 
6402 namespace llvm {
6403 
6404 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6405 
6406 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6407                               bool VectorizeOnlyWhenForced) {
6408   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6409 }
6410 
6411 } // end namespace llvm
6412 
6413 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6414   // Check if the pointer operand of a load or store instruction is
6415   // consecutive.
6416   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6417     return Legal->isConsecutivePtr(Ptr);
6418   return false;
6419 }
6420 
6421 void LoopVectorizationCostModel::collectValuesToIgnore() {
6422   // Ignore ephemeral values.
6423   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6424 
6425   // Ignore type-promoting instructions we identified during reduction
6426   // detection.
6427   for (auto &Reduction : *Legal->getReductionVars()) {
6428     RecurrenceDescriptor &RedDes = Reduction.second;
6429     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6430     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6431   }
6432   // Ignore type-casting instructions we identified during induction
6433   // detection.
6434   for (auto &Induction : *Legal->getInductionVars()) {
6435     InductionDescriptor &IndDes = Induction.second;
6436     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6437     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6438   }
6439 }
6440 
6441 // TODO: we could return a pair of values that specify the max VF and
6442 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6443 // `buildVPlans(VF, VF)`. We cannot do that yet because VPlan currently
6444 // lacks a cost model that can choose which plan to execute if more than
6445 // one is generated.
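     //
     // For example, with 256-bit wide vector registers and a widest scalar type
     // of i32, determineVPlanVF computes a VF of 256 / 32 = 8.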
6446 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6447                                  LoopVectorizationCostModel &CM) {
6448   unsigned WidestType;
6449   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6450   return WidestVectorRegBits / WidestType;
6451 }
6452 
6453 VectorizationFactor
6454 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6455   unsigned VF = UserVF;
6456   // Outer loop handling: outer loops may require CFG and instruction-level
6457   // transformations before we can even evaluate whether vectorization is
6458   // profitable. Since we cannot modify the incoming IR, we need to build VPlan
6459   // upfront in the vectorization pipeline.
6460   if (!OrigLoop->empty()) {
6461     // If the user doesn't provide a vectorization factor, determine a
6462     // reasonable one.
6463     if (!UserVF) {
6464       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector */), CM);
6465       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6466 
6467       // Make sure we have a VF > 1 for stress testing.
6468       if (VPlanBuildStressTest && VF < 2) {
6469         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6470                           << "overriding computed VF.\n");
6471         VF = 4;
6472       }
6473     }
6474     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6475     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6476     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6477                       << " to build VPlans.\n");
6478     buildVPlans(VF, VF);
6479 
6480     // For VPlan build stress testing, we bail out after VPlan construction.
6481     if (VPlanBuildStressTest)
6482       return VectorizationFactor::Disabled();
6483 
6484     return {VF, 0};
6485   }
6486 
6487   LLVM_DEBUG(
6488       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6489                 "VPlan-native path.\n");
6490   return VectorizationFactor::Disabled();
6491 }
6492 
6493 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6494   assert(OrigLoop->empty() && "Inner loop expected.");
6495   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6496   if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6497     return None;
6498 
6499   // Invalidate interleave groups if all blocks of loop will be predicated.
6500   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6501       !useMaskedInterleavedAccesses(*TTI)) {
6502     LLVM_DEBUG(
6503         dbgs()
6504         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6505            "which requires masked-interleaved support.\n");
6506     CM.InterleaveInfo.reset();
6507   }
6508 
6509   if (UserVF) {
6510     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6511     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6512     // Collect the instructions (and their associated costs) that will be more
6513     // profitable to scalarize.
6514     CM.selectUserVectorizationFactor(UserVF);
6515     buildVPlansWithVPRecipes(UserVF, UserVF);
6516     LLVM_DEBUG(printPlans(dbgs()));
6517     return {{UserVF, 0}};
6518   }
6519 
6520   unsigned MaxVF = MaybeMaxVF.getValue();
6521   assert(MaxVF != 0 && "MaxVF is zero.");
6522 
6523   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6524     // Collect Uniform and Scalar instructions after vectorization with VF.
6525     CM.collectUniformsAndScalars(VF);
6526 
6527     // Collect the instructions (and their associated costs) that will be more
6528     // profitable to scalarize.
6529     if (VF > 1)
6530       CM.collectInstsToScalarize(VF);
6531   }
6532 
6533   buildVPlansWithVPRecipes(1, MaxVF);
6534   LLVM_DEBUG(printPlans(dbgs()));
6535   if (MaxVF == 1)
6536     return VectorizationFactor::Disabled();
6537 
6538   // Select the optimal vectorization factor.
6539   return CM.selectVectorizationFactor(MaxVF);
6540 }
6541 
6542 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6543   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6544                     << '\n');
6545   BestVF = VF;
6546   BestUF = UF;
6547 
6548   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6549     return !Plan->hasVF(VF);
6550   });
6551   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6552 }
6553 
6554 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6555                                            DominatorTree *DT) {
6556   // Perform the actual loop transformation.
6557 
6558   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6559   VPCallbackILV CallbackILV(ILV);
6560 
6561   VPTransformState State{BestVF, BestUF,      LI,
6562                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6563                          &ILV,   CallbackILV};
6564   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6565   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6566 
6567   //===------------------------------------------------===//
6568   //
6569   // Notice: any optimization or new instruction that goes
6570   // into the code below should also be implemented in
6571   // the cost-model.
6572   //
6573   //===------------------------------------------------===//
6574 
6575   // 2. Copy and widen instructions from the old loop into the new loop.
6576   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6577   VPlans.front()->execute(&State);
6578 
6579   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6580   //    predication, updating analyses.
6581   ILV.fixVectorizedLoop();
6582 }
6583 
6584 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6585     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6586   BasicBlock *Latch = OrigLoop->getLoopLatch();
6587 
6588   // We create new control-flow for the vectorized loop, so the original
6589   // condition will be dead after vectorization if it's only used by the
6590   // branch.
6591   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6592   if (Cmp && Cmp->hasOneUse())
6593     DeadInstructions.insert(Cmp);
6594 
6595   // We create new "steps" for induction variable updates to which the original
6596   // induction variables map. An original update instruction will be dead if
6597   // all its users except the induction variable are dead.
6598   for (auto &Induction : *Legal->getInductionVars()) {
6599     PHINode *Ind = Induction.first;
6600     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6601     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6602           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6603                                  DeadInstructions.end();
6604         }))
6605       DeadInstructions.insert(IndUpdate);
6606 
6607     // We record as "Dead" also the type-casting instructions we had identified
6608     // during induction analysis. We don't need any handling for them in the
6609     // vectorized loop because we have proven that, under a proper runtime
6610     // test guarding the vectorized loop, the value of the phi, and the casted
6611     // value of the phi, are the same. The last instruction in this casting chain
6612     // will get its scalar/vector/widened def from the scalar/vector/widened def
6613     // of the respective phi node. Any other casts in the induction def-use chain
6614     // have no other uses outside the phi update chain, and will be ignored.
6615     InductionDescriptor &IndDes = Induction.second;
6616     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6617     DeadInstructions.insert(Casts.begin(), Casts.end());
6618   }
6619 }
6620 
6621 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6622 
6623 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6624 
6625 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6626                                         Instruction::BinaryOps BinOp) {
6627   // When unrolling and the VF is 1, we only need to add a simple scalar.
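       // E.g., for unroll part StartIdx = 2 with step %s, this produces
       // Val + 2 * %s (using BinOp for the floating-point case).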
6628   Type *Ty = Val->getType();
6629   assert(!Ty->isVectorTy() && "Val must be a scalar");
6630 
6631   if (Ty->isFloatingPointTy()) {
6632     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6633 
6634     // Floating point operations had to be 'fast' to enable the unrolling.
6635     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6636     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6637   }
6638   Constant *C = ConstantInt::get(Ty, StartIdx);
6639   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6640 }
6641 
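     // Mark \p L so that the runtime unroller leaves it alone: unless some
     // "llvm.loop.unroll.disable" metadata is already present, append an
     //   !{!"llvm.loop.unroll.runtime.disable"}
     // operand to the loop ID metadata.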
6642 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6643   SmallVector<Metadata *, 4> MDs;
6644   // Reserve first location for self reference to the LoopID metadata node.
6645   MDs.push_back(nullptr);
6646   bool IsUnrollMetadata = false;
6647   MDNode *LoopID = L->getLoopID();
6648   if (LoopID) {
6649     // First find existing loop unrolling disable metadata.
6650     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6651       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6652       if (MD) {
6653         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6654         IsUnrollMetadata |=
6655             S && S->getString().startswith("llvm.loop.unroll.disable");
6656       }
6657       MDs.push_back(LoopID->getOperand(i));
6658     }
6659   }
6660 
6661   if (!IsUnrollMetadata) {
6662     // Add runtime unroll disable metadata.
6663     LLVMContext &Context = L->getHeader()->getContext();
6664     SmallVector<Metadata *, 1> DisableOperands;
6665     DisableOperands.push_back(
6666         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6667     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6668     MDs.push_back(DisableNode);
6669     MDNode *NewLoopID = MDNode::get(Context, MDs);
6670     // Set operand 0 to refer to the loop id itself.
6671     NewLoopID->replaceOperandWith(0, NewLoopID);
6672     L->setLoopID(NewLoopID);
6673   }
6674 }
6675 
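     // Evaluate \p Predicate on \p Range.Start and clamp Range.End down to the
     // first VF at which the predicate gives a different answer, so that every
     // VF in the resulting range shares the decision taken at Range.Start. For
     // example, if Range = {2, 16} and the predicate holds for VF 2 and 4 but
     // not for VF 8, Range is clamped to {2, 8} and true is returned.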
6676 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6677     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6678   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6679   bool PredicateAtRangeStart = Predicate(Range.Start);
6680 
6681   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6682     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6683       Range.End = TmpVF;
6684       break;
6685     }
6686 
6687   return PredicateAtRangeStart;
6688 }
6689 
6690 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6691 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6692 /// of VF's starting at a given VF and extending it as much as possible. Each
6693 /// vectorization decision can potentially shorten this sub-range during
6694 /// buildVPlan().
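     /// For example, with MinVF = 1 and MaxVF = 8 and no decision splitting the
     /// range, a single VPlan covering VF = {1, 2, 4, 8} is built; if a decision
     /// changes at some VF, the range is split there and further VPlans are built.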
6695 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6696   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6697     VFRange SubRange = {VF, MaxVF + 1};
6698     VPlans.push_back(buildVPlan(SubRange));
6699     VF = SubRange.End;
6700   }
6701 }
6702 
6703 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6704                                          VPlanPtr &Plan) {
6705   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6706 
6707   // Look for cached value.
6708   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6709   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6710   if (ECEntryIt != EdgeMaskCache.end())
6711     return ECEntryIt->second;
6712 
6713   VPValue *SrcMask = createBlockInMask(Src, Plan);
6714 
6715   // The terminator has to be a branch inst!
6716   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6717   assert(BI && "Unexpected terminator found");
6718 
6719   if (!BI->isConditional())
6720     return EdgeMaskCache[Edge] = SrcMask;
6721 
6722   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6723   assert(EdgeMask && "No Edge Mask found for condition");
6724 
6725   if (BI->getSuccessor(0) != Dst)
6726     EdgeMask = Builder.createNot(EdgeMask);
6727 
6728   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6729     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6730 
6731   return EdgeMaskCache[Edge] = EdgeMask;
6732 }
6733 
6734 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6735   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6736 
6737   // Look for cached value.
6738   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6739   if (BCEntryIt != BlockMaskCache.end())
6740     return BCEntryIt->second;
6741 
6742   // All-one mask is modelled as no-mask following the convention for masked
6743   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6744   VPValue *BlockMask = nullptr;
6745 
6746   if (OrigLoop->getHeader() == BB) {
6747     if (!CM.blockNeedsPredication(BB))
6748       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6749 
6750     // Introduce the early-exit compare IV <= BTC to form header block mask.
6751     // This is used instead of IV < TC because TC may wrap, unlike BTC.
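         // E.g., for an i8 induction with a trip count of 256, TC wraps to 0
         // while BTC = 255 stays representable, so IV <= BTC covers all
         // iterations.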
6752     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6753     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6754     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6755     return BlockMaskCache[BB] = BlockMask;
6756   }
6757 
6758   // This is the block mask. We OR all incoming edges.
6759   for (auto *Predecessor : predecessors(BB)) {
6760     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6761     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6762       return BlockMaskCache[BB] = EdgeMask;
6763 
6764     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6765       BlockMask = EdgeMask;
6766       continue;
6767     }
6768 
6769     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6770   }
6771 
6772   return BlockMaskCache[BB] = BlockMask;
6773 }
6774 
6775 VPWidenMemoryInstructionRecipe *
6776 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6777                                   VPlanPtr &Plan) {
6778   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6779     return nullptr;
6780 
6781   auto willWiden = [&](unsigned VF) -> bool {
6782     if (VF == 1)
6783       return false;
6784     LoopVectorizationCostModel::InstWidening Decision =
6785         CM.getWideningDecision(I, VF);
6786     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6787            "CM decision should be taken at this point.");
6788     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6789       return true;
6790     if (CM.isScalarAfterVectorization(I, VF) ||
6791         CM.isProfitableToScalarize(I, VF))
6792       return false;
6793     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6794   };
6795 
6796   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6797     return nullptr;
6798 
6799   VPValue *Mask = nullptr;
6800   if (Legal->isMaskRequired(I))
6801     Mask = createBlockInMask(I->getParent(), Plan);
6802 
6803   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6804   return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
6805 }
6806 
6807 VPWidenIntOrFpInductionRecipe *
6808 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6809   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6810     // Check if this is an integer or fp induction. If so, build the recipe that
6811     // produces its scalar and vector values.
6812     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6813     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6814         II.getKind() == InductionDescriptor::IK_FpInduction)
6815       return new VPWidenIntOrFpInductionRecipe(Phi);
6816 
6817     return nullptr;
6818   }
6819 
6820   // Optimize the special case where the source is a constant integer
6821   // induction variable. Notice that we can only optimize the 'trunc' case
6822   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6823   // (c) other casts depend on pointer size.
6824 
6825   // Determine whether \p K is a truncation based on an induction variable that
6826   // can be optimized.
6827   auto isOptimizableIVTruncate =
6828       [&](Instruction *K) -> std::function<bool(unsigned)> {
6829     return
6830         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6831   };
6832 
6833   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6834                                isOptimizableIVTruncate(I), Range))
6835     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6836                                              cast<TruncInst>(I));
6837   return nullptr;
6838 }
6839 
6840 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6841   PHINode *Phi = dyn_cast<PHINode>(I);
6842   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6843     return nullptr;
6844 
6845   // We know that all PHIs in non-header blocks are converted into selects, so
6846   // we don't have to worry about the insertion order and we can just use the
6847   // builder. At this point we generate the predication tree. There may be
6848   // duplications since this is a simple recursive scan, but future
6849   // optimizations will clean it up.
6850 
6851   SmallVector<VPValue *, 2> Masks;
6852   unsigned NumIncoming = Phi->getNumIncomingValues();
6853   for (unsigned In = 0; In < NumIncoming; In++) {
6854     VPValue *EdgeMask =
6855       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6856     assert((EdgeMask || NumIncoming == 1) &&
6857            "Multiple predecessors with one having a full mask");
6858     if (EdgeMask)
6859       Masks.push_back(EdgeMask);
6860   }
6861   return new VPBlendRecipe(Phi, Masks);
6862 }
6863 
6864 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6865                                  VFRange &Range) {
6866 
6867   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6868       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6869 
6870   if (IsPredicated)
6871     return false;
6872 
6873   auto IsVectorizableOpcode = [](unsigned Opcode) {
6874     switch (Opcode) {
6875     case Instruction::Add:
6876     case Instruction::And:
6877     case Instruction::AShr:
6878     case Instruction::BitCast:
6879     case Instruction::Br:
6880     case Instruction::Call:
6881     case Instruction::FAdd:
6882     case Instruction::FCmp:
6883     case Instruction::FDiv:
6884     case Instruction::FMul:
6885     case Instruction::FNeg:
6886     case Instruction::FPExt:
6887     case Instruction::FPToSI:
6888     case Instruction::FPToUI:
6889     case Instruction::FPTrunc:
6890     case Instruction::FRem:
6891     case Instruction::FSub:
6892     case Instruction::ICmp:
6893     case Instruction::IntToPtr:
6894     case Instruction::Load:
6895     case Instruction::LShr:
6896     case Instruction::Mul:
6897     case Instruction::Or:
6898     case Instruction::PHI:
6899     case Instruction::PtrToInt:
6900     case Instruction::SDiv:
6901     case Instruction::Select:
6902     case Instruction::SExt:
6903     case Instruction::Shl:
6904     case Instruction::SIToFP:
6905     case Instruction::SRem:
6906     case Instruction::Store:
6907     case Instruction::Sub:
6908     case Instruction::Trunc:
6909     case Instruction::UDiv:
6910     case Instruction::UIToFP:
6911     case Instruction::URem:
6912     case Instruction::Xor:
6913     case Instruction::ZExt:
6914       return true;
6915     }
6916     return false;
6917   };
6918 
6919   if (!IsVectorizableOpcode(I->getOpcode()))
6920     return false;
6921 
6922   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6923     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6924     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6925                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6926       return false;
6927   }
6928 
6929   auto willWiden = [&](unsigned VF) -> bool {
6930     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6931                              CM.isProfitableToScalarize(I, VF)))
6932       return false;
6933     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6934       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6935       // The following case may be scalarized depending on the VF.
6936       // The flag shows whether we use an intrinsic or a regular call for the
6937       // vectorized version of the instruction: is it beneficial to perform the
6938       // intrinsic call compared to the library call?
6939       bool NeedToScalarize;
6940       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6941       bool UseVectorIntrinsic =
6942           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6943       return UseVectorIntrinsic || !NeedToScalarize;
6944     }
6945     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6946       assert(CM.getWideningDecision(I, VF) ==
6947                  LoopVectorizationCostModel::CM_Scalarize &&
6948              "Memory widening decisions should have been taken care by now");
6949       return false;
6950     }
6951     return true;
6952   };
6953 
6954   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6955     return false;
6956   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6957   // to avoid having to split recipes later.
6958   bool IsSingleton = Ingredient2Recipe.count(I);
6959 
6960   // Success: widen this instruction.
6961 
6962   // Use the default widening recipe. We optimize the common case where
6963   // consecutive instructions can be represented by a single recipe.
6964   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6965       LastExtensibleRecipe->appendInstruction(I))
6966     return true;
6967 
6968   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6969   if (!IsSingleton)
6970     LastExtensibleRecipe = WidenRecipe;
6971   setRecipe(I, WidenRecipe);
6972   VPBB->appendRecipe(WidenRecipe);
6973   return true;
6974 }
6975 
6976 VPBasicBlock *VPRecipeBuilder::handleReplication(
6977     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6978     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6979     VPlanPtr &Plan) {
6980   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6981       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6982       Range);
6983 
6984   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6985       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6986 
6987   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6988   setRecipe(I, Recipe);
6989 
6990   // Find if I uses a predicated instruction. If so, it will use its scalar
6991   // value. Avoid hoisting the insert-element which packs the scalar value into
6992   // a vector value, as that happens iff all users use the vector value.
6993   for (auto &Op : I->operands())
6994     if (auto *PredInst = dyn_cast<Instruction>(Op))
6995       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6996         PredInst2Recipe[PredInst]->setAlsoPack(false);
6997 
6998   // Finalize the recipe for Instr, first if it is not predicated.
6999   if (!IsPredicated) {
7000     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7001     VPBB->appendRecipe(Recipe);
7002     return VPBB;
7003   }
7004   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7005   assert(VPBB->getSuccessors().empty() &&
7006          "VPBB has successors when handling predicated replication.");
7007   // Record predicated instructions for above packing optimizations.
7008   PredInst2Recipe[I] = Recipe;
7009   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7010   VPBlockUtils::insertBlockAfter(Region, VPBB);
7011   auto *RegSucc = new VPBasicBlock();
7012   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7013   return RegSucc;
7014 }
7015 
7016 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7017                                                       VPRecipeBase *PredRecipe,
7018                                                       VPlanPtr &Plan) {
7019   // Instructions marked for predication are replicated and placed under an
7020   // if-then construct to prevent side-effects.
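       //
       // The region built below has a triangular shape:
       //
       //        [RegionName.entry]        BranchOnMask(BlockInMask)
       //          /             \
       //   [RegionName.if]       |        the replicated PredRecipe
       //          \             /
       //      [RegionName.continue]       optional PHI of the predicated value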
7021 
7022   // Generate recipes to compute the block mask for this region.
7023   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7024 
7025   // Build the triangular if-then region.
7026   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7027   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7028   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7029   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7030   auto *PHIRecipe =
7031       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7032   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7033   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7034   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7035 
7036   // Note: first set Entry as region entry and then connect successors starting
7037   // from it in order, to propagate the "parent" of each VPBasicBlock.
7038   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7039   VPBlockUtils::connectBlocks(Pred, Exit);
7040 
7041   return Region;
7042 }
7043 
7044 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7045                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7046   VPRecipeBase *Recipe = nullptr;
7047 
7048   // First, check for specific widening recipes that deal with memory
7049   // operations, inductions and Phi nodes.
7050   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7051       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7052       (Recipe = tryToBlend(Instr, Plan)) ||
7053       (isa<PHINode>(Instr) &&
7054        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7055     setRecipe(Instr, Recipe);
7056     VPBB->appendRecipe(Recipe);
7057     return true;
7058   }
7059 
7060   // Handle GEP widening.
7061   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7062     auto Scalarize = [&](unsigned VF) {
7063       return CM.isScalarWithPredication(Instr, VF) ||
7064              CM.isScalarAfterVectorization(Instr, VF) ||
7065              CM.isProfitableToScalarize(Instr, VF);
7066     };
7067     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7068       return false;
7069     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7070     setRecipe(Instr, Recipe);
7071     VPBB->appendRecipe(Recipe);
7072     return true;
7073   }
7074 
7075   // Check if Instr is to be widened by a general VPWidenRecipe, after
7076   // having first checked for specific widening recipes.
7077   if (tryToWiden(Instr, VPBB, Range))
7078     return true;
7079 
7080   return false;
7081 }
7082 
7083 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7084                                                         unsigned MaxVF) {
7085   assert(OrigLoop->empty() && "Inner loop expected.");
7086 
7087   // Collect conditions feeding internal conditional branches; they need to be
7088   // represented in VPlan for it to model masking.
7089   SmallPtrSet<Value *, 1> NeedDef;
7090 
7091   auto *Latch = OrigLoop->getLoopLatch();
7092   for (BasicBlock *BB : OrigLoop->blocks()) {
7093     if (BB == Latch)
7094       continue;
7095     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7096     if (Branch && Branch->isConditional())
7097       NeedDef.insert(Branch->getCondition());
7098   }
7099 
7100   // If the tail is to be folded by masking, the primary induction variable
7101   // needs to be represented in VPlan for it to model early-exit masking.
7102   // Also, both the Phi and the live-out instruction of each reduction are
7103   // required in order to introduce a select between them in VPlan.
7104   if (CM.foldTailByMasking()) {
7105     NeedDef.insert(Legal->getPrimaryInduction());
7106     for (auto &Reduction : *Legal->getReductionVars()) {
7107       NeedDef.insert(Reduction.first);
7108       NeedDef.insert(Reduction.second.getLoopExitInstr());
7109     }
7110   }
7111 
7112   // Collect instructions from the original loop that will become trivially dead
7113   // in the vectorized loop. We don't need to vectorize these instructions. For
7114   // example, original induction update instructions can become dead because we
7115   // separately emit induction "steps" when generating code for the new loop.
7116   // Similarly, we create a new latch condition when setting up the structure
7117   // of the new loop, so the old one can become dead.
7118   SmallPtrSet<Instruction *, 4> DeadInstructions;
7119   collectTriviallyDeadInstructions(DeadInstructions);
7120 
7121   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7122     VFRange SubRange = {VF, MaxVF + 1};
7123     VPlans.push_back(
7124         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
7125     VF = SubRange.End;
7126   }
7127 }
7128 
7129 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7130     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7131     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7132 
7133   // Hold a mapping from predicated instructions to their recipes, in order to
7134   // fix their AlsoPack behavior if a user is determined to replicate and use a
7135   // scalar instead of a vector value.
7136   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7137 
7138   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7139 
7140   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7141 
7142   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7143 
7144   // ---------------------------------------------------------------------------
7145   // Pre-construction: record ingredients whose recipes we'll need to further
7146   // process after constructing the initial VPlan.
7147   // ---------------------------------------------------------------------------
7148 
7149   // Mark instructions we'll need to sink later and their targets as
7150   // ingredients whose recipe we'll need to record.
7151   for (auto &Entry : SinkAfter) {
7152     RecipeBuilder.recordRecipeOf(Entry.first);
7153     RecipeBuilder.recordRecipeOf(Entry.second);
7154   }
7155 
7156   // For each interleave group which is relevant for this (possibly trimmed)
7157   // Range, add it to the set of groups to be later applied to the VPlan and add
7158   // placeholders for its members' Recipes which we'll be replacing with a
7159   // single VPInterleaveRecipe.
7160   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7161     auto applyIG = [IG, this](unsigned VF) -> bool {
7162       return (VF >= 2 && // Query is illegal for VF == 1
7163               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7164                   LoopVectorizationCostModel::CM_Interleave);
7165     };
7166     if (!getDecisionAndClampRange(applyIG, Range))
7167       continue;
7168     InterleaveGroups.insert(IG);
7169     for (unsigned i = 0; i < IG->getFactor(); i++)
7170       if (Instruction *Member = IG->getMember(i))
7171         RecipeBuilder.recordRecipeOf(Member);
7172   }
7173 
7174   // ---------------------------------------------------------------------------
7175   // Build initial VPlan: Scan the body of the loop in a topological order to
7176   // visit each basic block after having visited its predecessor basic blocks.
7177   // ---------------------------------------------------------------------------
7178 
7179   // Add assume instructions we need to drop to DeadInstructions, to prevent
7180   // them from being added to the VPlan.
7181   // TODO: We only need to drop assumes in blocks that get flattened. If the
7182   // control flow is preserved, we should keep them.
7183   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7184   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7185 
7186   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7187   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7188   auto Plan = std::make_unique<VPlan>(VPBB);
7189 
7190   // Represent values that will have defs inside VPlan.
7191   for (Value *V : NeedDef)
7192     Plan->addVPValue(V);
7193 
7194   // Scan the body of the loop in a topological order to visit each basic block
7195   // after having visited its predecessor basic blocks.
7196   LoopBlocksDFS DFS(OrigLoop);
7197   DFS.perform(LI);
7198 
7199   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7200     // Relevant instructions from basic block BB will be grouped into VPRecipe
7201     // ingredients, which will fill a new VPBasicBlock.
7202     unsigned VPBBsForBB = 0;
7203     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7204     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7205     VPBB = FirstVPBBForBB;
7206     Builder.setInsertPoint(VPBB);
7207 
7208     // Introduce each ingredient into VPlan.
7209     for (Instruction &I : BB->instructionsWithoutDebug()) {
7210       Instruction *Instr = &I;
7211 
7212       // First filter out irrelevant instructions, to ensure no recipes are
7213       // built for them.
7214       if (isa<BranchInst>(Instr) ||
7215           DeadInstructions.find(Instr) != DeadInstructions.end())
7216         continue;
7217 
7218       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7219         continue;
7220 
7221       // Otherwise, if all widening options failed, the instruction is to be
7222       // replicated. This may create a successor for VPBB.
7223       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7224           Instr, Range, VPBB, PredInst2Recipe, Plan);
7225       if (NextVPBB != VPBB) {
7226         VPBB = NextVPBB;
7227         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7228                                     : "");
7229       }
7230     }
7231   }
7232 
7233   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7234   // may also be empty, such as the last one (VPBB), reflecting original
7235   // basic blocks with no recipes.
7236   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7237   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7238   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7239   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7240   delete PreEntry;
7241 
7242   // ---------------------------------------------------------------------------
7243   // Transform initial VPlan: Apply previously taken decisions, in order, to
7244   // bring the VPlan to its final state.
7245   // ---------------------------------------------------------------------------
7246 
7247   // Apply Sink-After legal constraints.
7248   for (auto &Entry : SinkAfter) {
7249     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7250     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7251     Sink->moveAfter(Target);
7252   }
7253 
7254   // Interleave memory: for each Interleave Group we marked earlier as relevant
7255   // for this VPlan, replace the Recipes widening its memory instructions with a
7256   // single VPInterleaveRecipe at its insertion point.
7257   for (auto IG : InterleaveGroups) {
7258     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7259         RecipeBuilder.getRecipe(IG->getInsertPos()));
7260     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7261         ->insertBefore(Recipe);
7262 
7263     for (unsigned i = 0; i < IG->getFactor(); ++i)
7264       if (Instruction *Member = IG->getMember(i)) {
7265         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7266       }
7267   }
7268 
7269   // Finally, if the tail is folded by masking, introduce selects between the phi
7270   // and the live-out instruction of each reduction, at the end of the latch.
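       // The select yields the newly reduced value for lanes where the header
       // mask is set and keeps the phi's previous value for the masked-off tail
       // lanes.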
7271   if (CM.foldTailByMasking()) {
7272     Builder.setInsertPoint(VPBB);
7273     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7274     for (auto &Reduction : *Legal->getReductionVars()) {
7275       VPValue *Phi = Plan->getVPValue(Reduction.first);
7276       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7277       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7278     }
7279   }
7280 
7281   std::string PlanName;
7282   raw_string_ostream RSO(PlanName);
7283   unsigned VF = Range.Start;
7284   Plan->addVF(VF);
7285   RSO << "Initial VPlan for VF={" << VF;
7286   for (VF *= 2; VF < Range.End; VF *= 2) {
7287     Plan->addVF(VF);
7288     RSO << "," << VF;
7289   }
7290   RSO << "},UF>=1";
7291   RSO.flush();
7292   Plan->setName(PlanName);
7293 
7294   return Plan;
7295 }
7296 
7297 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7298   // Outer loop handling: outer loops may require CFG and instruction-level
7299   // transformations before we can even evaluate whether vectorization is
7300   // profitable. Since we cannot modify the incoming IR, we need to build VPlan
7301   // upfront in the vectorization pipeline.
7302   assert(!OrigLoop->empty());
7303   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7304 
7305   // Create new empty VPlan
7306   auto Plan = std::make_unique<VPlan>();
7307 
7308   // Build hierarchical CFG
7309   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7310   HCFGBuilder.buildHierarchicalCFG();
7311 
7312   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7313     Plan->addVF(VF);
7314 
7315   if (EnableVPlanPredication) {
7316     VPlanPredicator VPP(*Plan);
7317     VPP.predicate();
7318 
7319     // Avoid running the transformation to recipes until masked code generation
7320     // in the VPlan-native path is in place.
7321     return Plan;
7322   }
7323 
7324   SmallPtrSet<Instruction *, 1> DeadInstructions;
7325   VPlanTransforms::VPInstructionsToVPRecipes(
7326       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7327   return Plan;
7328 }
7329 
7330 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
7331     Value *V, unsigned Part) {
7332   return ILV.getOrCreateVectorValue(V, Part);
7333 }
7334 
7335 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7336     Value *V, const VPIteration &Instance) {
7337   return ILV.getOrCreateScalarValue(V, Instance);
7338 }
7339 
7340 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7341   O << " +\n"
7342     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7343   IG->getInsertPos()->printAsOperand(O, false);
7344   O << ", ";
7345   getAddr()->printAsOperand(O);
7346   VPValue *Mask = getMask();
7347   if (Mask) {
7348     O << ", ";
7349     Mask->printAsOperand(O);
7350   }
7351   O << "\\l\"";
7352   for (unsigned i = 0; i < IG->getFactor(); ++i)
7353     if (Instruction *I = IG->getMember(i))
7354       O << " +\n"
7355         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7356 }
7357 
7358 void VPWidenRecipe::execute(VPTransformState &State) {
7359   for (auto &Instr : make_range(Begin, End))
7360     State.ILV->widenInstruction(Instr);
7361 }
7362 
7363 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7364   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7365                       IsIndexLoopInvariant);
7366 }
7367 
7368 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7369   assert(!State.Instance && "Int or FP induction being replicated.");
7370   State.ILV->widenIntOrFpInduction(IV, Trunc);
7371 }
7372 
7373 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7374   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7375 }
7376 
7377 void VPBlendRecipe::execute(VPTransformState &State) {
7378   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7379   // We know that all PHIs in non-header blocks are converted into
7380   // selects, so we don't have to worry about the insertion order and we
7381   // can just use the builder.
7382   // At this point we generate the predication tree. There may be
7383   // duplications since this is a simple recursive scan, but future
7384   // optimizations will clean it up.
7385 
7386   unsigned NumIncoming = Phi->getNumIncomingValues();
7387 
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7390   // Generate a sequence of selects of the form:
7391   // SELECT(Mask3, In3,
7392   //      SELECT(Mask2, In2,
7393   //                   ( ...)))
7394   InnerLoopVectorizer::VectorParts Entry(State.UF);
7395   for (unsigned In = 0; In < NumIncoming; ++In) {
7396     for (unsigned Part = 0; Part < State.UF; ++Part) {
7397       // We might have single edge PHIs (blocks) - use an identity
7398       // 'select' for the first PHI operand.
7399       Value *In0 =
7400           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7401       if (In == 0)
7402         Entry[Part] = In0; // Initialize with the first incoming value.
7403       else {
7404         // Select between the current value and the previous incoming edge
7405         // based on the incoming mask.
7406         Value *Cond = State.get(User->getOperand(In), Part);
7407         Entry[Part] =
7408             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7409       }
7410     }
7411   }
7412   for (unsigned Part = 0; Part < State.UF; ++Part)
7413     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7414 }
7415 
7416 void VPInterleaveRecipe::execute(VPTransformState &State) {
7417   assert(!State.Instance && "Interleave group being replicated.");
7418   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7419                                       getMask());
7420 }
7421 
7422 void VPReplicateRecipe::execute(VPTransformState &State) {
7423   if (State.Instance) { // Generate a single instance.
7424     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7425     // Insert scalar instance packing it into a vector.
7426     if (AlsoPack && State.VF > 1) {
7427       // If we're constructing lane 0, initialize to start from undef.
7428       if (State.Instance->Lane == 0) {
7429         Value *Undef =
7430             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7431         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7432       }
7433       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7434     }
7435     return;
7436   }
7437 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
7441   unsigned EndLane = IsUniform ? 1 : State.VF;
7442   for (unsigned Part = 0; Part < State.UF; ++Part)
7443     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7444       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7445 }
7446 
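// Generate the conditional branch terminating a predicated block: extract the
// current lane's bit from the block-in mask (or use an all-true condition if
// no mask is attached) and replace the block's placeholder unreachable
// terminator with a conditional branch whose successors are set later.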
7447 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7448   assert(State.Instance && "Branch on Mask works only on single instance.");
7449 
7450   unsigned Part = State.Instance->Part;
7451   unsigned Lane = State.Instance->Lane;
7452 
7453   Value *ConditionBit = nullptr;
7454   if (!User) // Block in mask is all-one.
7455     ConditionBit = State.Builder.getTrue();
7456   else {
7457     VPValue *BlockInMask = User->getOperand(0);
7458     ConditionBit = State.get(BlockInMask, Part);
7459     if (ConditionBit->getType()->isVectorTy())
7460       ConditionBit = State.Builder.CreateExtractElement(
7461           ConditionBit, State.Builder.getInt32(Lane));
7462   }
7463 
7464   // Replace the temporary unreachable terminator with a new conditional branch,
7465   // whose two destinations will be set later when they are created.
7466   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7467   assert(isa<UnreachableInst>(CurrentTerminator) &&
7468          "Expected to replace unreachable terminator with conditional branch.");
7469   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7470   CondBr->setSuccessor(0, nullptr);
7471   ReplaceInstWithInst(CurrentTerminator, CondBr);
7472 }
7473 
7474 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7475   assert(State.Instance && "Predicated instruction PHI works per instance.");
7476   Instruction *ScalarPredInst = cast<Instruction>(
7477       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7478   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7479   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7480   assert(PredicatingBB && "Predicated block has no single predecessor.");
7481 
7482   // By current pack/unpack logic we need to generate only a single phi node: if
7483   // a vector value for the predicated instruction exists at this point it means
7484   // the instruction has vector users only, and a phi for the vector value is
7485   // needed. In this case the recipe of the predicated instruction is marked to
7486   // also do that packing, thereby "hoisting" the insert-element sequence.
7487   // Otherwise, a phi node for the scalar value is needed.
7488   unsigned Part = State.Instance->Part;
7489   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7490     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7491     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7492     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7493     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7494     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7495     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7496   } else {
7497     Type *PredInstType = PredInst->getType();
7498     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7499     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7500     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7501     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7502   }
7503 }
7504 
7505 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7506   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
7507 }
7508 
7509 // Determine how to lower the scalar epilogue, which depends on 1) optimising
7510 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7511 // predication, and 4) a TTI hook that analyses whether the loop is suitable
7512 // for predication.
7513 static ScalarEpilogueLowering getScalarEpilogueLowering(
7514     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7515     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7516     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7517     LoopVectorizationLegality &LVL) {
7518   bool OptSize =
7519       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7520                                                      PGSOQueryType::IRPass);
7521   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7522   // don't look at hints or options, and don't request a scalar epilogue.
7523   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7524     return CM_ScalarEpilogueNotAllowedOptSize;
7525 
7526   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7527                               !PreferPredicateOverEpilog;
7528 
7529   // 2) Next, if disabling predication is requested on the command line, honour
7530   // this and request a scalar epilogue. Also do this if we don't have a
7531   // primary induction variable, which is required for predication.
7532   if (PredicateOptDisabled || !LVL.getPrimaryInduction())
7533     return CM_ScalarEpilogueAllowed;
7534 
  // 3) and 4) Look if predication is requested on the command line, via a
  // loop hint, or by the TTI hook (provided a hint does not disable it); in
  // any of these cases, request predication.
7538   if (PreferPredicateOverEpilog ||
7539       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7540       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7541                                         LVL.getLAI()) &&
7542        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7543     return CM_ScalarEpilogueNotNeededUsePredicate;
7544 
7545   return CM_ScalarEpilogueAllowed;
7546 }
7547 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying
// the input LLVM IR.
7552 static bool processLoopInVPlanNativePath(
7553     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7554     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7555     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7556     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7557     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7558 
7559   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7560   Function *F = L->getHeader()->getParent();
7561   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7562 
7563   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7564       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7565 
7566   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7567                                 &Hints, IAI);
7568   // Use the planner for outer loop vectorization.
7569   // TODO: CM is not used at this point inside the planner. Turn CM into an
7570   // optional argument if we don't need it in the future.
7571   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7572 
7573   // Get user vectorization factor.
7574   const unsigned UserVF = Hints.getWidth();
7575 
7576   // Plan how to best vectorize, return the best VF and its cost.
7577   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7578 
7579   // If we are stress testing VPlan builds, do not attempt to generate vector
7580   // code. Masked vector code generation support will follow soon.
7581   // Also, do not attempt to vectorize if no vector code will be produced.
7582   if (VPlanBuildStressTest || EnableVPlanPredication ||
7583       VectorizationFactor::Disabled() == VF)
7584     return false;
7585 
7586   LVP.setBestPlan(VF.Width, 1);
7587 
7588   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7589                          &CM);
7590   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7591                     << L->getHeader()->getParent()->getName() << "\"\n");
7592   LVP.executePlan(LB, DT);
7593 
7594   // Mark the loop as already vectorized to avoid vectorizing again.
7595   Hints.setAlreadyVectorized();
7596 
7597   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7598   return true;
7599 }
7600 
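// Process a single loop: check that its hints and legality allow
// vectorization, pick a vectorization factor and interleave count, and emit
// the vectorized and/or interleaved loop if that is profitable or forced.
// Returns true if the IR of the function was changed.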
7601 bool LoopVectorizePass::processLoop(Loop *L) {
7602   assert((EnableVPlanNativePath || L->empty()) &&
7603          "VPlan-native path is not enabled. Only process inner loops.");
7604 
7605 #ifndef NDEBUG
7606   const std::string DebugLocStr = getDebugLocString(L);
7607 #endif /* NDEBUG */
7608 
7609   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7610                     << L->getHeader()->getParent()->getName() << "\" from "
7611                     << DebugLocStr << "\n");
7612 
7613   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7614 
7615   LLVM_DEBUG(
7616       dbgs() << "LV: Loop hints:"
7617              << " force="
7618              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7619                      ? "disabled"
7620                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7621                             ? "enabled"
7622                             : "?"))
7623              << " width=" << Hints.getWidth()
7624              << " unroll=" << Hints.getInterleave() << "\n");
7625 
7626   // Function containing loop
7627   Function *F = L->getHeader()->getParent();
7628 
  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
7636 
7637   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7638     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7639     return false;
7640   }
7641 
7642   PredicatedScalarEvolution PSE(*SE, *L);
7643 
7644   // Check if it is legal to vectorize the loop.
7645   LoopVectorizationRequirements Requirements(*ORE);
7646   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7647                                 &Requirements, &Hints, DB, AC);
7648   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7649     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7650     Hints.emitRemarkWithHints();
7651     return false;
7652   }
7653 
7654   // Check the function attributes and profiles to find out if this function
7655   // should be optimized for size.
7656   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7657       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7658 
7659   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7660   // here. They may require CFG and instruction level transformations before
7661   // even evaluating whether vectorization is profitable. Since we cannot modify
7662   // the incoming IR, we need to build VPlan upfront in the vectorization
7663   // pipeline.
7664   if (!L->empty())
7665     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7666                                         ORE, BFI, PSI, Hints);
7667 
7668   assert(L->empty() && "Inner loop expected.");
7669 
7670   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7671   // count by optimizing for size, to minimize overheads.
7672   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7673   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7674     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7675                       << "This loop is worth vectorizing only if no scalar "
7676                       << "iteration overheads are incurred.");
7677     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7678       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7679     else {
7680       LLVM_DEBUG(dbgs() << "\n");
7681       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7682     }
7683   }
7684 
7685   // Check the function attributes to see if implicit floats are allowed.
7686   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7687   // an integer loop and the vector instructions selected are purely integer
7688   // vector instructions?
7689   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7690     reportVectorizationFailure(
7691         "Can't vectorize when the NoImplicitFloat attribute is used",
7692         "loop not vectorized due to NoImplicitFloat attribute",
7693         "NoImplicitFloat", ORE, L);
7694     Hints.emitRemarkWithHints();
7695     return false;
7696   }
7697 
7698   // Check if the target supports potentially unsafe FP vectorization.
7699   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7700   // for the target we're vectorizing for, to make sure none of the
7701   // additional fp-math flags can help.
7702   if (Hints.isPotentiallyUnsafe() &&
7703       TTI->isFPVectorizationPotentiallyUnsafe()) {
7704     reportVectorizationFailure(
7705         "Potentially unsafe FP op prevents vectorization",
7706         "loop not vectorized due to unsafe FP support.",
7707         "UnsafeFP", ORE, L);
7708     Hints.emitRemarkWithHints();
7709     return false;
7710   }
7711 
7712   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7713   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7714 
7715   // If an override option has been passed in for interleaved accesses, use it.
7716   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7717     UseInterleaved = EnableInterleavedMemAccesses;
7718 
7719   // Analyze interleaved memory accesses.
7720   if (UseInterleaved) {
7721     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7722   }
7723 
7724   // Use the cost model.
7725   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7726                                 F, &Hints, IAI);
7727   CM.collectValuesToIgnore();
7728 
7729   // Use the planner for vectorization.
7730   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7731 
7732   // Get user vectorization factor.
7733   unsigned UserVF = Hints.getWidth();
7734 
7735   // Plan how to best vectorize, return the best VF and its cost.
7736   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7737 
7738   VectorizationFactor VF = VectorizationFactor::Disabled();
7739   unsigned IC = 1;
7740   unsigned UserIC = Hints.getInterleave();
7741 
7742   if (MaybeVF) {
7743     VF = *MaybeVF;
7744     // Select the interleave count.
7745     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7746   }
7747 
7748   // Identify the diagnostic messages that should be produced.
7749   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7750   bool VectorizeLoop = true, InterleaveLoop = true;
7751   if (Requirements.doesNotMeet(F, L, Hints)) {
7752     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7753                          "requirements.\n");
7754     Hints.emitRemarkWithHints();
7755     return false;
7756   }
7757 
7758   if (VF.Width == 1) {
7759     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7760     VecDiagMsg = std::make_pair(
7761         "VectorizationNotBeneficial",
7762         "the cost-model indicates that vectorization is not beneficial");
7763     VectorizeLoop = false;
7764   }
7765 
7766   if (!MaybeVF && UserIC > 1) {
7767     // Tell the user interleaving was avoided up-front, despite being explicitly
7768     // requested.
7769     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7770                          "interleaving should be avoided up front\n");
7771     IntDiagMsg = std::make_pair(
7772         "InterleavingAvoided",
7773         "Ignoring UserIC, because interleaving was avoided up front");
7774     InterleaveLoop = false;
7775   } else if (IC == 1 && UserIC <= 1) {
7776     // Tell the user interleaving is not beneficial.
7777     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7778     IntDiagMsg = std::make_pair(
7779         "InterleavingNotBeneficial",
7780         "the cost-model indicates that interleaving is not beneficial");
7781     InterleaveLoop = false;
7782     if (UserIC == 1) {
7783       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7784       IntDiagMsg.second +=
7785           " and is explicitly disabled or interleave count is set to 1";
7786     }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
                         "disabled.\n");
7791     IntDiagMsg = std::make_pair(
7792         "InterleavingBeneficialButDisabled",
7793         "the cost-model indicates that interleaving is beneficial "
7794         "but is explicitly disabled or interleave count is set to 1");
7795     InterleaveLoop = false;
7796   }
7797 
7798   // Override IC if user provided an interleave count.
7799   IC = UserIC > 0 ? UserIC : IC;
7800 
7801   // Emit diagnostic messages, if any.
7802   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7803   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7805     ORE->emit([&]() {
7806       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7807                                       L->getStartLoc(), L->getHeader())
7808              << VecDiagMsg.second;
7809     });
7810     ORE->emit([&]() {
7811       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7812                                       L->getStartLoc(), L->getHeader())
7813              << IntDiagMsg.second;
7814     });
7815     return false;
7816   } else if (!VectorizeLoop && InterleaveLoop) {
7817     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7818     ORE->emit([&]() {
7819       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7820                                         L->getStartLoc(), L->getHeader())
7821              << VecDiagMsg.second;
7822     });
7823   } else if (VectorizeLoop && !InterleaveLoop) {
7824     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7825                       << ") in " << DebugLocStr << '\n');
7826     ORE->emit([&]() {
7827       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7828                                         L->getStartLoc(), L->getHeader())
7829              << IntDiagMsg.second;
7830     });
7831   } else if (VectorizeLoop && InterleaveLoop) {
7832     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7833                       << ") in " << DebugLocStr << '\n');
7834     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7835   }
7836 
7837   LVP.setBestPlan(VF.Width, IC);
7838 
7839   using namespace ore;
7840   bool DisableRuntimeUnroll = false;
7841   MDNode *OrigLoopID = L->getLoopID();
7842 
7843   if (!VectorizeLoop) {
7844     assert(IC > 1 && "interleave count should not be 1 or 0");
7845     // If we decided that it is not legal to vectorize the loop, then
7846     // interleave it.
7847     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7848                                &CM);
7849     LVP.executePlan(Unroller, DT);
7850 
7851     ORE->emit([&]() {
7852       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7853                                 L->getHeader())
7854              << "interleaved loop (interleaved count: "
7855              << NV("InterleaveCount", IC) << ")";
7856     });
7857   } else {
7858     // If we decided that it is *legal* to vectorize the loop, then do it.
7859     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7860                            &LVL, &CM);
7861     LVP.executePlan(LB, DT);
7862     ++LoopsVectorized;
7863 
7864     // Add metadata to disable runtime unrolling a scalar loop when there are
7865     // no runtime checks about strides and memory. A scalar loop that is
7866     // rarely used is not worth unrolling.
7867     if (!LB.areSafetyChecksAdded())
7868       DisableRuntimeUnroll = true;
7869 
7870     // Report the vectorization decision.
7871     ORE->emit([&]() {
7872       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7873                                 L->getHeader())
7874              << "vectorized loop (vectorization width: "
7875              << NV("VectorizationFactor", VF.Width)
7876              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7877     });
7878   }
7879 
7880   Optional<MDNode *> RemainderLoopID =
7881       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7882                                       LLVMLoopVectorizeFollowupEpilogue});
7883   if (RemainderLoopID.hasValue()) {
7884     L->setLoopID(RemainderLoopID.getValue());
7885   } else {
7886     if (DisableRuntimeUnroll)
7887       AddRuntimeUnrollDisableMetaData(L);
7888 
7889     // Mark the loop as already vectorized to avoid vectorizing again.
7890     Hints.setAlreadyVectorized();
7891   }
7892 
7893   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7894   return true;
7895 }
7896 
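// Entry point shared by the pass-manager wrappers: cache the analysis
// results, simplify all loops upfront, and then visit each supported inner
// loop collected in the worklist.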
7897 bool LoopVectorizePass::runImpl(
7898     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7899     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7900     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7901     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7902     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7903   SE = &SE_;
7904   LI = &LI_;
7905   TTI = &TTI_;
7906   DT = &DT_;
7907   BFI = &BFI_;
7908   TLI = TLI_;
7909   AA = &AA_;
7910   AC = &AC_;
7911   GetLAA = &GetLAA_;
7912   DB = &DB_;
7913   ORE = &ORE_;
7914   PSI = PSI_;
7915 
7916   // Don't attempt if
7917   // 1. the target claims to have no vector registers, and
7918   // 2. interleaving won't help ILP.
7919   //
7920   // The second condition is necessary because, even if the target has no
7921   // vector registers, loop vectorization may still enable scalar
7922   // interleaving.
7923   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7924       TTI->getMaxInterleaveFactor(1) < 2)
7925     return false;
7926 
7927   bool Changed = false;
7928 
7929   // The vectorizer requires loops to be in simplified form.
7930   // Since simplification may add new inner loops, it has to run before the
7931   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7933   // vectorized.
7934   for (auto &L : *LI)
7935     Changed |=
7936         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7937 
7938   // Build up a worklist of inner-loops to vectorize. This is necessary as
7939   // the act of vectorizing or partially unrolling a loop creates new loops
7940   // and can invalidate iterators across the loops.
7941   SmallVector<Loop *, 8> Worklist;
7942 
7943   for (Loop *L : *LI)
7944     collectSupportedLoops(*L, LI, ORE, Worklist);
7945 
7946   LoopsAnalyzed += Worklist.size();
7947 
7948   // Now walk the identified inner loops.
7949   while (!Worklist.empty()) {
7950     Loop *L = Worklist.pop_back_val();
7951 
7952     // For the inner loops we actually process, form LCSSA to simplify the
7953     // transform.
7954     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7955 
7956     Changed |= processLoop(L);
7957   }
7958 
7959   // Process each loop nest in the function.
7960   return Changed;
7961 }
7962 
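// New pass manager entry point: gather the required analyses, set up the
// callback providing LoopAccessInfo on demand, and delegate to runImpl.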
7963 PreservedAnalyses LoopVectorizePass::run(Function &F,
7964                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
8006 }
8007