1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
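//
// For example (an illustrative sketch; the epilogue/remainder handling and the
// actual IR are omitted), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten, for a vectorization factor of 4, into
//
//   for (int i = 0; i + 3 < n; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];  // one wide 4-element add and store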
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/SizeOpts.h"
141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142 #include <algorithm>
143 #include <cassert>
144 #include <cstdint>
145 #include <cstdlib>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
180 // Indicates that an epilogue is undesired; predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185     cl::desc("Indicate that an epilogue is undesired, predication should be "
186              "used instead."));
187 
188 static cl::opt<bool> MaximizeBandwidth(
189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190     cl::desc("Maximize bandwidth when selecting vectorization factor which "
191              "will be determined by the smallest type in loop."));
192 
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
202 
203 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
204     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
205     cl::desc("We don't interleave loops with an estimated constant trip count "
206              "below this number"));
207 
208 static cl::opt<unsigned> ForceTargetNumScalarRegs(
209     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
210     cl::desc("A flag that overrides the target's number of scalar registers."));
211 
212 static cl::opt<unsigned> ForceTargetNumVectorRegs(
213     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
214     cl::desc("A flag that overrides the target's number of vector registers."));
215 
216 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
217     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
218     cl::desc("A flag that overrides the target's max interleave factor for "
219              "scalar loops."));
220 
221 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
222     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
223     cl::desc("A flag that overrides the target's max interleave factor for "
224              "vectorized loops."));
225 
226 static cl::opt<unsigned> ForceTargetInstructionCost(
227     "force-target-instruction-cost", cl::init(0), cl::Hidden,
228     cl::desc("A flag that overrides the target's expected cost for "
229              "an instruction to a single constant value. Mostly "
230              "useful for getting consistent testing."));
231 
232 static cl::opt<unsigned> SmallLoopCost(
233     "small-loop-cost", cl::init(20), cl::Hidden,
234     cl::desc(
235         "The cost of a loop that is considered 'small' by the interleaver."));
236 
237 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
238     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
239     cl::desc("Enable the use of the block frequency analysis to access PGO "
240              "heuristics minimizing code growth in cold regions and being more "
241              "aggressive in hot regions."));
242 
243 // Runtime interleave loops for load/store throughput.
244 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
245     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
246     cl::desc(
247         "Enable runtime interleaving until load/store ports are saturated"));
248 
249 /// The number of stores in a loop that are allowed to need predication.
250 static cl::opt<unsigned> NumberOfStoresToPredicate(
251     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
252     cl::desc("Max number of stores to be predicated behind an if."));
253 
254 static cl::opt<bool> EnableIndVarRegisterHeur(
255     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
256     cl::desc("Count the induction variable only once when interleaving"));
257 
258 static cl::opt<bool> EnableCondStoresVectorization(
259     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
260     cl::desc("Enable if-predication of stores during vectorization."));
261 
262 static cl::opt<unsigned> MaxNestedScalarReductionIC(
263     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
264     cl::desc("The maximum interleave count to use when interleaving a scalar "
265              "reduction in a nested loop."));
266 
267 cl::opt<bool> EnableVPlanNativePath(
268     "enable-vplan-native-path", cl::init(false), cl::Hidden,
269     cl::desc("Enable VPlan-native vectorization path with "
270              "support for outer loop vectorization."));
271 
272 // FIXME: Remove this switch once we have divergence analysis. Currently we
273 // assume divergent non-backedge branches when this switch is true.
274 cl::opt<bool> EnableVPlanPredication(
275     "enable-vplan-predication", cl::init(false), cl::Hidden,
276     cl::desc("Enable VPlan-native vectorization path predicator with "
277              "support for outer loop vectorization."));
278 
279 // This flag enables the stress testing of the VPlan H-CFG construction in the
280 // VPlan-native vectorization path. It must be used in conjunction with
281 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
282 // verification of the H-CFGs built.
283 static cl::opt<bool> VPlanBuildStressTest(
284     "vplan-build-stress-test", cl::init(false), cl::Hidden,
285     cl::desc(
286         "Build VPlan for every supported loop nest in the function and bail "
287         "out right after the build (stress test the VPlan H-CFG construction "
288         "in the VPlan-native vectorization path)."));
289 
290 cl::opt<bool> llvm::EnableLoopInterleaving(
291     "interleave-loops", cl::init(true), cl::Hidden,
292     cl::desc("Enable loop interleaving in Loop vectorization passes"));
293 cl::opt<bool> llvm::EnableLoopVectorization(
294     "vectorize-loops", cl::init(true), cl::Hidden,
295     cl::desc("Run the Loop vectorization passes"));
296 
297 /// A helper function for converting Scalar types to vector types.
298 /// If the incoming type is void, we return void. If the VF is 1, we return
299 /// the scalar type.
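/// For example, ToVectorTy(i32, 4) produces <4 x i32>, while ToVectorTy(i32, 1)
/// and ToVectorTy(void, 4) return the input type unchanged.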
300 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
301   if (Scalar->isVoidTy() || VF == 1)
302     return Scalar;
303   return VectorType::get(Scalar, VF);
304 }
305 
306 /// A helper function that returns the type of loaded or stored value.
307 static Type *getMemInstValueType(Value *I) {
308   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
309          "Expected Load or Store instruction");
310   if (auto *LI = dyn_cast<LoadInst>(I))
311     return LI->getType();
312   return cast<StoreInst>(I)->getValueOperand()->getType();
313 }
314 
315 /// A helper function that returns true if the given type is irregular. The
316 /// type is irregular if its allocated size doesn't equal the store size of an
317 /// element of the corresponding vector type at the given vectorization factor.
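/// For example, with a typical data layout i1 is irregular: each element is
/// allocated a full byte, so at VF = 4 the 4 allocated bytes do not match the
/// 1-byte store size of <4 x i1>.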
318 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
319   // Determine if an array of VF elements of type Ty is "bitcast compatible"
320   // with a <VF x Ty> vector.
321   if (VF > 1) {
322     auto *VectorTy = VectorType::get(Ty, VF);
323     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
324   }
325 
326   // If the vectorization factor is one, we just check if an array of type Ty
327   // requires padding between elements.
328   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
329 }
330 
331 /// A helper function that returns the reciprocal of the block probability of
332 /// predicated blocks. If we return X, we are assuming the predicated block
333 /// will execute once for every X iterations of the loop header.
334 ///
335 /// TODO: We should use actual block probability here, if available. Currently,
336 ///       we always assume predicated blocks have a 50% chance of executing.
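/// For example, a cost-model client would typically charge a predicated block
/// BlockCost / getReciprocalPredBlockProb() per header iteration, i.e. half of
/// the block's cost under the current 50% assumption.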
337 static unsigned getReciprocalPredBlockProb() { return 2; }
338 
339 /// A helper function that adds a 'fast' flag to floating-point operations.
340 static Value *addFastMathFlag(Value *V) {
341   if (isa<FPMathOperator>(V))
342     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
343   return V;
344 }
345 
346 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
347   if (isa<FPMathOperator>(V))
348     cast<Instruction>(V)->setFastMathFlags(FMF);
349   return V;
350 }
351 
352 /// A helper function that returns an integer or floating-point constant with
353 /// value C.
354 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
355   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
356                            : ConstantFP::get(Ty, C);
357 }
358 
359 /// Returns "best known" trip count for the specified loop \p L as defined by
360 /// the following procedure:
361 ///   1) Returns exact trip count if it is known.
362 ///   2) Returns expected trip count according to profile data if any.
363 ///   3) Returns upper bound estimate if it is known.
364 ///   4) Returns None if all of the above failed.
365 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
366   // Check if exact trip count is known.
367   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
368     return ExpectedTC;
369 
370   // Check if there is an expected trip count available from profile data.
371   if (LoopVectorizeWithBlockFrequency)
372     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
373       return EstimatedTC;
374 
375   // Check if upper bound estimate is known.
376   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
377     return ExpectedTC;
378 
379   return None;
380 }
381 
382 namespace llvm {
383 
384 /// InnerLoopVectorizer vectorizes loops which contain only one basic
385 /// block to a specified vectorization factor (VF).
386 /// This class performs the widening of scalars into vectors, or multiple
387 /// scalars. This class also implements the following features:
388 /// * It inserts an epilogue loop for handling loops that don't have iteration
389 ///   counts that are known to be a multiple of the vectorization factor.
390 /// * It handles the code generation for reduction variables.
391 /// * Scalarization (implementation using scalars) of un-vectorizable
392 ///   instructions.
393 /// InnerLoopVectorizer does not perform any vectorization-legality
394 /// checks, and relies on the caller to check for the different legality
395 /// aspects. The InnerLoopVectorizer relies on the
396 /// LoopVectorizationLegality class to provide information about the induction
397 /// and reduction variables that were found in the loop.
398 class InnerLoopVectorizer {
399 public:
400   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
401                       LoopInfo *LI, DominatorTree *DT,
402                       const TargetLibraryInfo *TLI,
403                       const TargetTransformInfo *TTI, AssumptionCache *AC,
404                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
405                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
406                       LoopVectorizationCostModel *CM)
407       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
408         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
409         Builder(PSE.getSE()->getContext()),
410         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
411   virtual ~InnerLoopVectorizer() = default;
412 
413   /// Create a new empty loop. Unlink the old loop and connect the new one.
414   /// Return the pre-header block of the new loop.
415   BasicBlock *createVectorizedLoopSkeleton();
416 
417   /// Widen a single instruction within the innermost loop.
418   void widenInstruction(Instruction &I);
419 
420   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
421   void fixVectorizedLoop();
422 
423   // Return true if any runtime check is added.
424   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
425 
426   /// A type for vectorized values in the new loop. Each value from the
427   /// original loop, when vectorized, is represented by UF vector values in the
428   /// new unrolled loop, where UF is the unroll factor.
429   using VectorParts = SmallVector<Value *, 2>;
430 
431   /// Vectorize a single GetElementPtrInst based on information gathered and
432   /// decisions taken during planning.
433   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
434                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
435 
436   /// Vectorize a single PHINode in a block. This method handles the induction
437   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
438   /// arbitrary length vectors.
439   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
440 
441   /// A helper function to scalarize a single Instruction in the innermost loop.
442   /// Generates a sequence of scalar instances for each lane between \p MinLane
443   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
444   /// inclusive.
445   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
446                             bool IfPredicateInstr);
447 
448   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
449   /// is provided, the integer induction variable will first be truncated to
450   /// the corresponding type.
451   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
452 
453   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
454   /// vector or scalar value on-demand if one is not yet available. When
455   /// vectorizing a loop, we visit the definition of an instruction before its
456   /// uses. When visiting the definition, we either vectorize or scalarize the
457   /// instruction, creating an entry for it in the corresponding map. (In some
458   /// cases, such as induction variables, we will create both vector and scalar
459   /// entries.) Then, as we encounter uses of the definition, we derive values
460   /// for each scalar or vector use unless such a value is already available.
461   /// For example, if we scalarize a definition and one of its uses is vector,
462   /// we build the required vector on-demand with an insertelement sequence
463   /// when visiting the use. Otherwise, if the use is scalar, we can use the
464   /// existing scalar definition.
465   ///
466   /// Return a value in the new loop corresponding to \p V from the original
467   /// loop at unroll index \p Part. If the value has already been vectorized,
468   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
469   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
470   /// a new vector value on-demand by inserting the scalar values into a vector
471   /// with an insertelement sequence. If the value has been neither vectorized
472   /// nor scalarized, it must be loop invariant, so we simply broadcast the
473   /// value into a vector.
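  /// For example (illustrative IR, the value names are made up), if an i32
  /// definition %d was scalarized at VF = 2 into lanes %d.0 and %d.1, a vector
  /// use is served by packing:
  ///   %v.0 = insertelement <2 x i32> undef, i32 %d.0, i32 0
  ///   %v.1 = insertelement <2 x i32> %v.0, i32 %d.1, i32 1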
474   Value *getOrCreateVectorValue(Value *V, unsigned Part);
475 
476   /// Return a value in the new loop corresponding to \p V from the original
477   /// loop at unroll and vector indices \p Instance. If the value has been
478   /// vectorized but not scalarized, the necessary extractelement instruction
479   /// will be generated.
480   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
481 
482   /// Construct the vector value of a scalarized value \p V one lane at a time.
483   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
484 
485   /// Try to vectorize the interleaved access group that \p Instr belongs to,
486   /// optionally masking the vector operations if \p BlockInMask is non-null.
487   void vectorizeInterleaveGroup(Instruction *Instr,
488                                 VectorParts *BlockInMask = nullptr);
489 
490   /// Vectorize Load and Store instructions, optionally masking the vector
491   /// operations if \p BlockInMask is non-null.
492   void vectorizeMemoryInstruction(Instruction *Instr,
493                                   VectorParts *BlockInMask = nullptr);
494 
495   /// Set the debug location in the builder using the debug location in
496   /// the instruction.
497   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
498 
499   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
500   void fixNonInductionPHIs();
501 
502 protected:
503   friend class LoopVectorizationPlanner;
504 
505   /// A small list of PHINodes.
506   using PhiVector = SmallVector<PHINode *, 4>;
507 
508   /// A type for scalarized values in the new loop. Each value from the
509   /// original loop, when scalarized, is represented by UF x VF scalar values
510   /// in the new unrolled loop, where UF is the unroll factor and VF is the
511   /// vectorization factor.
512   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
513 
514   /// Set up the values of the IVs correctly when exiting the vector loop.
515   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
516                     Value *CountRoundDown, Value *EndValue,
517                     BasicBlock *MiddleBlock);
518 
519   /// Create a new induction variable inside L.
520   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
521                                    Value *Step, Instruction *DL);
522 
523   /// Handle all cross-iteration phis in the header.
524   void fixCrossIterationPHIs();
525 
526   /// Fix a first-order recurrence. This is the second phase of vectorizing
527   /// this phi node.
528   void fixFirstOrderRecurrence(PHINode *Phi);
529 
530   /// Fix a reduction cross-iteration phi. This is the second phase of
531   /// vectorizing this phi node.
532   void fixReduction(PHINode *Phi);
533 
534   /// Clear NSW/NUW flags from reduction instructions if necessary.
535   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
536 
537   /// The loop exit block may have single-value PHI nodes with some
538   /// incoming value. While vectorizing, we only handled real values
539   /// that were defined inside the loop, and we should have one value for
540   /// each predecessor of its parent basic block. See PR14725.
541   void fixLCSSAPHIs();
542 
543   /// Iteratively sink the scalarized operands of a predicated instruction into
544   /// the block that was created for it.
545   void sinkScalarOperands(Instruction *PredInst);
546 
547   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
548   /// represented as.
549   void truncateToMinimalBitwidths();
550 
551   /// Insert the new loop into the loop hierarchy and pass manager
552   /// and update the analysis passes.
553   void updateAnalysis();
554 
555   /// Create a broadcast instruction. This method generates a broadcast
556   /// instruction (shuffle) for loop invariant values and for the induction
557   /// value. If this is the induction variable then we extend it to N, N+1, ...
558   /// this is needed because each iteration in the loop corresponds to a SIMD
559   /// element.
560   virtual Value *getBroadcastInstrs(Value *V);
561 
562   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
563   /// to each vector element of Val. The sequence starts at StartIdx.
564   /// \p Opcode is relevant for FP induction variables.
565   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
566                                Instruction::BinaryOps Opcode =
567                                Instruction::BinaryOpsEnd);
568 
569   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
570   /// variable on which to base the steps, \p Step is the size of the step, and
571   /// \p EntryVal is the value from the original loop that maps to the steps.
572   /// Note that \p EntryVal doesn't have to be an induction variable - it
573   /// can also be a truncate instruction.
574   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
575                         const InductionDescriptor &ID);
576 
577   /// Create a vector induction phi node based on an existing scalar one. \p
578   /// EntryVal is the value from the original loop that maps to the vector phi
579   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
580   /// truncate instruction, instead of widening the original IV, we widen a
581   /// version of the IV truncated to \p EntryVal's type.
582   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
583                                        Value *Step, Instruction *EntryVal);
584 
585   /// Returns true if an instruction \p I should be scalarized instead of
586   /// vectorized for the chosen vectorization factor.
587   bool shouldScalarizeInstruction(Instruction *I) const;
588 
589   /// Returns true if we should generate a scalar version of \p IV.
590   bool needsScalarInduction(Instruction *IV) const;
591 
592   /// If there is a cast involved in the induction variable \p ID, which should
593   /// be ignored in the vectorized loop body, this function records the
594   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
595   /// cast. We had already proved that the casted Phi is equal to the uncasted
596   /// Phi in the vectorized loop (under a runtime guard), and therefore
597   /// there is no need to vectorize the cast - the same value can be used in the
598   /// vector loop for both the Phi and the cast.
599   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
600   /// otherwise, \p VectorLoopValue is a widened/vectorized value.
601   ///
602   /// \p EntryVal is the value from the original loop that maps to the vector
603   /// phi node and is used to distinguish what is the IV currently being
604   /// processed - original one (if \p EntryVal is a phi corresponding to the
605   /// original IV) or the "newly-created" one based on the proof mentioned above
606   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
607   /// latter case \p EntryVal is a TruncInst and we must not record anything for
608   /// that IV, but it's error-prone to expect callers of this routine to care
609   /// about that, hence this explicit parameter.
610   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
611                                              const Instruction *EntryVal,
612                                              Value *VectorLoopValue,
613                                              unsigned Part,
614                                              unsigned Lane = UINT_MAX);
615 
616   /// Generate a shuffle sequence that will reverse the vector Vec.
617   virtual Value *reverseVector(Value *Vec);
618 
619   /// Returns (and creates if needed) the original loop trip count.
620   Value *getOrCreateTripCount(Loop *NewLoop);
621 
622   /// Returns (and creates if needed) the trip count of the widened loop.
623   Value *getOrCreateVectorTripCount(Loop *NewLoop);
624 
625   /// Returns a bitcasted value to the requested vector type.
626   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
627   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
628                                 const DataLayout &DL);
629 
630   /// Emit a bypass check to see if the vector trip count is zero, including if
631   /// it overflows.
632   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
633 
634   /// Emit a bypass check to see if all of the SCEV assumptions we've
635   /// had to make are correct.
636   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
637 
638   /// Emit bypass checks to check any memory assumptions we may have made.
639   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
640 
641   /// Compute the transformed value of Index at offset StartValue using step
642   /// StepValue.
643   /// For integer induction, returns StartValue + Index * StepValue.
644   /// For pointer induction, returns StartValue[Index * StepValue].
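  /// For example, for an integer induction with StartValue 10 and StepValue 2,
  /// an Index of 3 is transformed to 10 + 3 * 2 = 16.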
645   /// FIXME: The newly created binary instructions should contain nsw/nuw
646   /// flags, which can be found from the original scalar operations.
647   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
648                               const DataLayout &DL,
649                               const InductionDescriptor &ID) const;
650 
651   /// Add additional metadata to \p To that was not present on \p Orig.
652   ///
653   /// Currently this is used to add the noalias annotations based on the
654   /// inserted memchecks.  Use this for instructions that are *cloned* into the
655   /// vector loop.
656   void addNewMetadata(Instruction *To, const Instruction *Orig);
657 
658   /// Add metadata from one instruction to another.
659   ///
660   /// This includes both the original MDs from \p From and additional ones (\see
661   /// addNewMetadata).  Use this for *newly created* instructions in the vector
662   /// loop.
663   void addMetadata(Instruction *To, Instruction *From);
664 
665   /// Similar to the previous function but it adds the metadata to a
666   /// vector of instructions.
667   void addMetadata(ArrayRef<Value *> To, Instruction *From);
668 
669   /// The original loop.
670   Loop *OrigLoop;
671 
672   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
673   /// dynamic knowledge to simplify SCEV expressions and converts them to a
674   /// more usable form.
675   PredicatedScalarEvolution &PSE;
676 
677   /// Loop Info.
678   LoopInfo *LI;
679 
680   /// Dominator Tree.
681   DominatorTree *DT;
682 
683   /// Alias Analysis.
684   AliasAnalysis *AA;
685 
686   /// Target Library Info.
687   const TargetLibraryInfo *TLI;
688 
689   /// Target Transform Info.
690   const TargetTransformInfo *TTI;
691 
692   /// Assumption Cache.
693   AssumptionCache *AC;
694 
695   /// Interface to emit optimization remarks.
696   OptimizationRemarkEmitter *ORE;
697 
698   /// LoopVersioning.  It's only set up (non-null) if memchecks were
699   /// used.
700   ///
701   /// This is currently only used to add no-alias metadata based on the
702   /// memchecks.  The actual versioning is performed manually.
703   std::unique_ptr<LoopVersioning> LVer;
704 
705   /// The vectorization SIMD factor to use. Each vector will have this many
706   /// vector elements.
707   unsigned VF;
708 
709   /// The vectorization unroll factor to use. Each scalar is vectorized to this
710   /// many different vector instructions.
711   unsigned UF;
712 
713   /// The builder that we use
714   IRBuilder<> Builder;
715 
716   // --- Vectorization state ---
717 
718   /// The vector-loop preheader.
719   BasicBlock *LoopVectorPreHeader;
720 
721   /// The scalar-loop preheader.
722   BasicBlock *LoopScalarPreHeader;
723 
724   /// Middle Block between the vector and the scalar.
725   BasicBlock *LoopMiddleBlock;
726 
727   /// The ExitBlock of the scalar loop.
728   BasicBlock *LoopExitBlock;
729 
730   /// The vector loop body.
731   BasicBlock *LoopVectorBody;
732 
733   /// The scalar loop body.
734   BasicBlock *LoopScalarBody;
735 
736   /// A list of all bypass blocks. The first block is the entry of the loop.
737   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
738 
739   /// The new Induction variable which was added to the new block.
740   PHINode *Induction = nullptr;
741 
742   /// The induction variable of the old basic block.
743   PHINode *OldInduction = nullptr;
744 
745   /// Maps values from the original loop to their corresponding values in the
746   /// vectorized loop. A key value can map to either vector values, scalar
747   /// values or both kinds of values, depending on whether the key was
748   /// vectorized and scalarized.
749   VectorizerValueMap VectorLoopValueMap;
750 
751   /// Store instructions that were predicated.
752   SmallVector<Instruction *, 4> PredicatedInstructions;
753 
754   /// Trip count of the original loop.
755   Value *TripCount = nullptr;
756 
757   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
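  /// (e.g. TripCount = 17 with VF = 4 and UF = 2 gives 17 - 17 % 8 = 16).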
758   Value *VectorTripCount = nullptr;
759 
760   /// The legality analysis.
761   LoopVectorizationLegality *Legal;
762 
763   /// The profitability analysis.
764   LoopVectorizationCostModel *Cost;
765 
766   // Record whether runtime checks are added.
767   bool AddedSafetyChecks = false;
768 
769   // Holds the end values for each induction variable. We save the end values
770   // so we can later fix-up the external users of the induction variables.
771   DenseMap<PHINode *, Value *> IVEndValues;
772 
773   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
774   // fixed up at the end of vector code generation.
775   SmallVector<PHINode *, 8> OrigPHIsToFix;
776 };
777 
778 class InnerLoopUnroller : public InnerLoopVectorizer {
779 public:
780   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
781                     LoopInfo *LI, DominatorTree *DT,
782                     const TargetLibraryInfo *TLI,
783                     const TargetTransformInfo *TTI, AssumptionCache *AC,
784                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
785                     LoopVectorizationLegality *LVL,
786                     LoopVectorizationCostModel *CM)
787       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
788                             UnrollFactor, LVL, CM) {}
789 
790 private:
791   Value *getBroadcastInstrs(Value *V) override;
792   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
793                        Instruction::BinaryOps Opcode =
794                        Instruction::BinaryOpsEnd) override;
795   Value *reverseVector(Value *Vec) override;
796 };
797 
798 } // end namespace llvm
799 
800 /// Look for a meaningful debug location on the instruction or its
801 /// operands.
802 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
803   if (!I)
804     return I;
805 
806   DebugLoc Empty;
807   if (I->getDebugLoc() != Empty)
808     return I;
809 
810   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
811     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
812       if (OpInst->getDebugLoc() != Empty)
813         return OpInst;
814   }
815 
816   return I;
817 }
818 
819 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
820   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
821     const DILocation *DIL = Inst->getDebugLoc();
822     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
823         !isa<DbgInfoIntrinsic>(Inst)) {
824       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
825       if (NewDIL)
826         B.SetCurrentDebugLocation(NewDIL.getValue());
827       else
828         LLVM_DEBUG(dbgs()
829                    << "Failed to create new discriminator: "
830                    << DIL->getFilename() << " Line: " << DIL->getLine());
831     }
832     else
833       B.SetCurrentDebugLocation(DIL);
834   } else
835     B.SetCurrentDebugLocation(DebugLoc());
836 }
837 
838 /// Write a record \p DebugMsg about vectorization failure to the debug
839 /// output stream. If \p I is passed, it is an instruction that prevents
840 /// vectorization.
841 #ifndef NDEBUG
842 static void debugVectorizationFailure(const StringRef DebugMsg,
843     Instruction *I) {
844   dbgs() << "LV: Not vectorizing: " << DebugMsg;
845   if (I != nullptr)
846     dbgs() << " " << *I;
847   else
848     dbgs() << '.';
849   dbgs() << '\n';
850 }
851 #endif
852 
853 /// Create an analysis remark that explains why vectorization failed
854 ///
855 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
856 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
857 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
858 /// the location of the remark.  \return the remark object that can be
859 /// streamed to.
860 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
861     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
862   Value *CodeRegion = TheLoop->getHeader();
863   DebugLoc DL = TheLoop->getStartLoc();
864 
865   if (I) {
866     CodeRegion = I->getParent();
867     // If there is no debug location attached to the instruction, fall back to
868     // using the loop's.
869     if (I->getDebugLoc())
870       DL = I->getDebugLoc();
871   }
872 
873   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
874   R << "loop not vectorized: ";
875   return R;
876 }
877 
878 namespace llvm {
879 
880 void reportVectorizationFailure(const StringRef DebugMsg,
881     const StringRef OREMsg, const StringRef ORETag,
882     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
883   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
884   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
885   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
886                 ORETag, TheLoop, I) << OREMsg);
887 }
888 
889 } // end namespace llvm
890 
891 #ifndef NDEBUG
892 /// \return string containing a file name and a line # for the given loop.
893 static std::string getDebugLocString(const Loop *L) {
894   std::string Result;
895   if (L) {
896     raw_string_ostream OS(Result);
897     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
898       LoopDbgLoc.print(OS);
899     else
900       // Just print the module name.
901       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
902     OS.flush();
903   }
904   return Result;
905 }
906 #endif
907 
908 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
909                                          const Instruction *Orig) {
910   // If the loop was versioned with memchecks, add the corresponding no-alias
911   // metadata.
912   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
913     LVer->annotateInstWithNoAlias(To, Orig);
914 }
915 
916 void InnerLoopVectorizer::addMetadata(Instruction *To,
917                                       Instruction *From) {
918   propagateMetadata(To, From);
919   addNewMetadata(To, From);
920 }
921 
922 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
923                                       Instruction *From) {
924   for (Value *V : To) {
925     if (Instruction *I = dyn_cast<Instruction>(V))
926       addMetadata(I, From);
927   }
928 }
929 
930 namespace llvm {
931 
932 // Loop vectorization cost-model hints for how the scalar epilogue loop should
933 // be lowered.
934 enum ScalarEpilogueLowering {
935 
936   // The default: allowing scalar epilogues.
937   CM_ScalarEpilogueAllowed,
938 
939   // Vectorization with OptForSize: don't allow epilogues.
940   CM_ScalarEpilogueNotAllowedOptSize,
941 
942   // A special case of vectorization with OptForSize: loops with a very small
943   // trip count are considered for vectorization under OptForSize, thereby
944   // making sure the cost of their loop body is dominant, free of runtime
945   // guards and scalar iteration overheads.
946   CM_ScalarEpilogueNotAllowedLowTripLoop,
947 
948   // Loop hint predicate indicating an epilogue is undesired.
949   CM_ScalarEpilogueNotNeededUsePredicate
950 };
951 
952 /// LoopVectorizationCostModel - estimates the expected speedups due to
953 /// vectorization.
954 /// In many cases vectorization is not profitable. This can happen for a
955 /// number of reasons. In this class we mainly attempt to predict the
956 /// expected speedup/slowdowns due to the supported instruction set. We use the
957 /// TargetTransformInfo to query the different backends for the cost of
958 /// different operations.
959 class LoopVectorizationCostModel {
960 public:
961   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
962                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
963                              LoopVectorizationLegality *Legal,
964                              const TargetTransformInfo &TTI,
965                              const TargetLibraryInfo *TLI, DemandedBits *DB,
966                              AssumptionCache *AC,
967                              OptimizationRemarkEmitter *ORE, const Function *F,
968                              const LoopVectorizeHints *Hints,
969                              InterleavedAccessInfo &IAI)
970       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
971         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
972         Hints(Hints), InterleaveInfo(IAI) {}
973 
974   /// \return An upper bound for the vectorization factor, or None if
975   /// vectorization and interleaving should be avoided up front.
976   Optional<unsigned> computeMaxVF();
977 
978   /// \return True if runtime checks are required for vectorization, and false
979   /// otherwise.
980   bool runtimeChecksRequired();
981 
982   /// \return The most profitable vectorization factor and the cost of that VF.
983   /// This method checks every power of two up to MaxVF. If UserVF is not zero
984   /// then this vectorization factor will be selected if vectorization is
985   /// possible.
986   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
987 
988   /// Setup cost-based decisions for user vectorization factor.
989   void selectUserVectorizationFactor(unsigned UserVF) {
990     collectUniformsAndScalars(UserVF);
991     collectInstsToScalarize(UserVF);
992   }
993 
994   /// \return The size (in bits) of the smallest and widest types in the code
995   /// that needs to be vectorized. We ignore values that remain scalar such as
996   /// 64 bit loop indices.
997   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
998 
999   /// \return The desired interleave count.
1000   /// If interleave count has been specified by metadata it will be returned.
1001   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1002   /// are the selected vectorization factor and the cost of the selected VF.
1003   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1004 
1005   /// A memory access instruction may be vectorized in more than one way.
1006   /// The form of the instruction after vectorization depends on its cost.
1007   /// This function makes cost-based decisions for Load/Store instructions
1008   /// and collects them in a map. This decision map is used for building
1009   /// the lists of loop-uniform and loop-scalar instructions.
1010   /// The calculated cost is saved with the widening decision in order to
1011   /// avoid redundant calculations.
1012   void setCostBasedWideningDecision(unsigned VF);
1013 
1014   /// A struct that represents some properties of the register usage
1015   /// of a loop.
1016   struct RegisterUsage {
1017     /// Holds the number of loop invariant values that are used in the loop.
1018     /// The key is ClassID of target-provided register class.
1019     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1020     /// Holds the maximum number of concurrent live intervals in the loop.
1021     /// The key is ClassID of target-provided register class.
1022     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1023   };
1024 
1025   /// \return Returns information about the register usages of the loop for the
1026   /// given vectorization factors.
1027   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1028 
1029   /// Collect values we want to ignore in the cost model.
1030   void collectValuesToIgnore();
1031 
1032   /// \returns The smallest bitwidth each instruction can be represented with.
1033   /// The vector equivalents of these instructions should be truncated to this
1034   /// type.
1035   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1036     return MinBWs;
1037   }
1038 
1039   /// \returns True if it is more profitable to scalarize instruction \p I for
1040   /// vectorization factor \p VF.
1041   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1042     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1043 
1044     // Cost model is not run in the VPlan-native path - return conservative
1045     // result until this changes.
1046     if (EnableVPlanNativePath)
1047       return false;
1048 
1049     auto Scalars = InstsToScalarize.find(VF);
1050     assert(Scalars != InstsToScalarize.end() &&
1051            "VF not yet analyzed for scalarization profitability");
1052     return Scalars->second.find(I) != Scalars->second.end();
1053   }
1054 
1055   /// Returns true if \p I is known to be uniform after vectorization.
1056   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1057     if (VF == 1)
1058       return true;
1059 
1060     // Cost model is not run in the VPlan-native path - return conservative
1061     // result until this changes.
1062     if (EnableVPlanNativePath)
1063       return false;
1064 
1065     auto UniformsPerVF = Uniforms.find(VF);
1066     assert(UniformsPerVF != Uniforms.end() &&
1067            "VF not yet analyzed for uniformity");
1068     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1069   }
1070 
1071   /// Returns true if \p I is known to be scalar after vectorization.
1072   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1073     if (VF == 1)
1074       return true;
1075 
1076     // Cost model is not run in the VPlan-native path - return conservative
1077     // result until this changes.
1078     if (EnableVPlanNativePath)
1079       return false;
1080 
1081     auto ScalarsPerVF = Scalars.find(VF);
1082     assert(ScalarsPerVF != Scalars.end() &&
1083            "Scalar values are not calculated for VF");
1084     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1085   }
1086 
1087   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1088   /// for vectorization factor \p VF.
1089   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1090     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1091            !isProfitableToScalarize(I, VF) &&
1092            !isScalarAfterVectorization(I, VF);
1093   }
1094 
1095   /// Decision that was taken during cost calculation for memory instruction.
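  /// For example (illustrative only; the actual choice is cost-driven and
  /// target-dependent), a unit-stride load from a[i] typically maps to
  /// CM_Widen, a load from a[b[i]] to CM_GatherScatter or CM_Scalarize, and a
  /// member of an interleaved access group to CM_Interleave.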
1096   enum InstWidening {
1097     CM_Unknown,
1098     CM_Widen,         // For consecutive accesses with stride +1.
1099     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1100     CM_Interleave,
1101     CM_GatherScatter,
1102     CM_Scalarize
1103   };
1104 
1105   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1106   /// instruction \p I and vector width \p VF.
1107   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1108                            unsigned Cost) {
1109     assert(VF >= 2 && "Expected VF >=2");
1110     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1111   }
1112 
1113   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1114   /// interleaving group \p Grp and vector width \p VF.
1115   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1116                            InstWidening W, unsigned Cost) {
1117     assert(VF >= 2 && "Expected VF >=2");
1118     /// Broadcast this decision to all instructions inside the group.
1119     /// But the cost will be assigned to one instruction only.
1120     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1121       if (auto *I = Grp->getMember(i)) {
1122         if (Grp->getInsertPos() == I)
1123           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1124         else
1125           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1126       }
1127     }
1128   }
1129 
1130   /// Return the cost model decision for the given instruction \p I and vector
1131   /// width \p VF. Return CM_Unknown if this instruction did not pass
1132   /// through the cost modeling.
1133   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1134     assert(VF >= 2 && "Expected VF >=2");
1135 
1136     // Cost model is not run in the VPlan-native path - return conservative
1137     // result until this changes.
1138     if (EnableVPlanNativePath)
1139       return CM_GatherScatter;
1140 
1141     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1142     auto Itr = WideningDecisions.find(InstOnVF);
1143     if (Itr == WideningDecisions.end())
1144       return CM_Unknown;
1145     return Itr->second.first;
1146   }
1147 
1148   /// Return the vectorization cost for the given instruction \p I and vector
1149   /// width \p VF.
1150   unsigned getWideningCost(Instruction *I, unsigned VF) {
1151     assert(VF >= 2 && "Expected VF >=2");
1152     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1153     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1154            "The cost is not calculated");
1155     return WideningDecisions[InstOnVF].second;
1156   }
1157 
1158   /// Return True if instruction \p I is an optimizable truncate whose operand
1159   /// is an induction variable. Such a truncate will be removed by adding a new
1160   /// induction variable with the destination type.
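  /// For example (illustrative IR), given a widened i64 induction %iv, a use
  ///   %t = trunc i64 %iv to i32
  /// can instead be fed by a new i32 induction variable that produces the
  /// truncated values directly, making the truncate itself dead.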
1161   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1162     // If the instruction is not a truncate, return false.
1163     auto *Trunc = dyn_cast<TruncInst>(I);
1164     if (!Trunc)
1165       return false;
1166 
1167     // Get the source and destination types of the truncate.
1168     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1169     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1170 
1171     // If the truncate is free for the given types, return false. Replacing a
1172     // free truncate with an induction variable would add an induction variable
1173     // update instruction to each iteration of the loop. We exclude from this
1174     // check the primary induction variable since it will need an update
1175     // instruction regardless.
1176     Value *Op = Trunc->getOperand(0);
1177     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1178       return false;
1179 
1180     // If the truncated value is not an induction variable, return false.
1181     return Legal->isInductionPhi(Op);
1182   }
1183 
1184   /// Collects the instructions to scalarize for each predicated instruction in
1185   /// the loop.
1186   void collectInstsToScalarize(unsigned VF);
1187 
1188   /// Collect Uniform and Scalar values for the given \p VF.
1189   /// The sets depend on the CM decision for Load/Store instructions
1190   /// that may be vectorized as interleaved, gather/scatter or scalarized.
1191   void collectUniformsAndScalars(unsigned VF) {
1192     // Do the analysis once.
1193     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1194       return;
1195     setCostBasedWideningDecision(VF);
1196     collectLoopUniforms(VF);
1197     collectLoopScalars(VF);
1198   }
1199 
1200   /// Returns true if the target machine supports masked store operation
1201   /// for the given \p DataType and kind of access to \p Ptr.
1202   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1203     return Legal->isConsecutivePtr(Ptr) &&
1204            TTI.isLegalMaskedStore(DataType, Alignment);
1205   }
1206 
1207   /// Returns true if the target machine supports masked load operation
1208   /// for the given \p DataType and kind of access to \p Ptr.
1209   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1210     return Legal->isConsecutivePtr(Ptr) &&
1211            TTI.isLegalMaskedLoad(DataType, Alignment);
1212   }
1213 
1214   /// Returns true if the target machine supports masked scatter operation
1215   /// for the given \p DataType.
1216   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1217     return TTI.isLegalMaskedScatter(DataType, Alignment);
1218   }
1219 
1220   /// Returns true if the target machine supports masked gather operation
1221   /// for the given \p DataType.
1222   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1223     return TTI.isLegalMaskedGather(DataType, Alignment);
1224   }
1225 
1226   /// Returns true if the target machine can represent \p V as a masked gather
1227   /// or scatter operation.
1228   bool isLegalGatherOrScatter(Value *V) {
1229     bool LI = isa<LoadInst>(V);
1230     bool SI = isa<StoreInst>(V);
1231     if (!LI && !SI)
1232       return false;
1233     auto *Ty = getMemInstValueType(V);
1234     MaybeAlign Align = getLoadStoreAlignment(V);
1235     return (LI && isLegalMaskedGather(Ty, Align)) ||
1236            (SI && isLegalMaskedScatter(Ty, Align));
1237   }
1238 
1239   /// Returns true if \p I is an instruction that will be scalarized with
1240   /// predication. Such instructions include conditional stores and
1241   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be scalarized
  /// with predication for that VF.
1244   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1245 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1249   bool isPredicatedInst(Instruction *I) {
1250     if (!blockNeedsPredication(I->getParent()))
1251       return false;
1252     // Loads and stores that need some form of masked operation are predicated
1253     // instructions.
1254     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1255       return Legal->isMaskRequired(I);
1256     return isScalarWithPredication(I);
1257   }
1258 
1259   /// Returns true if \p I is a memory instruction with consecutive memory
1260   /// access that can be widened.
1261   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1262 
1263   /// Returns true if \p I is a memory instruction in an interleaved-group
1264   /// of memory accesses that can be vectorized with wide vector loads/stores
1265   /// and shuffles.
1266   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1267 
1268   /// Check if \p Instr belongs to any interleaved access group.
1269   bool isAccessInterleaved(Instruction *Instr) {
1270     return InterleaveInfo.isInterleaved(Instr);
1271   }
1272 
1273   /// Get the interleaved access group that \p Instr belongs to.
1274   const InterleaveGroup<Instruction> *
1275   getInterleavedAccessGroup(Instruction *Instr) {
1276     return InterleaveInfo.getInterleaveGroup(Instr);
1277   }
1278 
1279   /// Returns true if an interleaved group requires a scalar iteration
1280   /// to handle accesses with gaps, and there is nothing preventing us from
1281   /// creating a scalar epilogue.
1282   bool requiresScalarEpilogue() const {
1283     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1284   }
1285 
  /// Returns true if a scalar epilogue is allowed, i.e. not prohibited due to
  /// optsize or a loop hint annotation.
1288   bool isScalarEpilogueAllowed() const {
1289     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1290   }
1291 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1293   bool foldTailByMasking() const { return FoldTailByMasking; }
1294 
1295   bool blockNeedsPredication(BasicBlock *BB) {
1296     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1297   }
1298 
1299   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1300   /// with factor VF.  Return the cost of the instruction, including
1301   /// scalarization overhead if it's needed.
1302   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1303 
1304   /// Estimate cost of a call instruction CI if it were vectorized with factor
1305   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or it is too
  /// expensive.
1309   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1310 
1311 private:
1312   unsigned NumPredStores = 0;
1313 
1314   /// \return An upper bound for the vectorization factor, larger than zero.
1315   /// One is returned if vectorization should best be avoided due to cost.
1316   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1317 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1325   using VectorizationCostTy = std::pair<unsigned, bool>;
1326 
1327   /// Returns the expected execution cost. The unit of the cost does
1328   /// not matter because we use the 'cost' units to compare different
1329   /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
1331   VectorizationCostTy expectedCost(unsigned VF);
1332 
1333   /// Returns the execution time cost of an instruction for a given vector
1334   /// width. Vector width of one means scalar.
1335   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1336 
1337   /// The cost-computation logic from getInstructionCost which provides
1338   /// the vector type as an output parameter.
1339   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1340 
1341   /// Calculate vectorization cost of memory instruction \p I.
1342   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1343 
  /// The cost computation for a scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for an interleave group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for a gather/scatter instruction.
1351   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1352 
1353   /// The cost computation for widening instruction \p I with consecutive
1354   /// memory access.
1355   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1356 
1357   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1358   /// Load: scalar load + broadcast.
1359   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1360   /// element)
1361   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1362 
1363   /// Estimate the overhead of scalarizing an instruction. This is a
1364   /// convenience wrapper for the type-based getScalarizationOverhead API.
1365   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1366 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1369   bool isConsecutiveLoadOrStore(Instruction *I);
1370 
1371   /// Returns true if an artificially high cost for emulated masked memrefs
1372   /// should be used.
1373   bool useEmulatedMaskMemRefHack(Instruction *I);
1374 
1375   /// Map of scalar integer values to the smallest bitwidth they can be legally
1376   /// represented as. The vector equivalents of these values should be truncated
1377   /// to this type.
1378   MapVector<Instruction *, uint64_t> MinBWs;
1379 
1380   /// A type representing the costs for instructions if they were to be
1381   /// scalarized rather than vectorized. The entries are Instruction-Cost
1382   /// pairs.
1383   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1384 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1387   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1388 
1389   /// Records whether it is allowed to have the original scalar loop execute at
1390   /// least once. This may be needed as a fallback loop in case runtime
1391   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1393   /// or as a peel-loop to handle gaps in interleave-groups.
1394   /// Under optsize and when the trip count is very small we don't allow any
1395   /// iterations to execute in the scalar loop.
1396   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1397 
  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
1399   bool FoldTailByMasking = false;
1400 
1401   /// A map holding scalar costs for different vectorization factors. The
1402   /// presence of a cost for an instruction in the mapping indicates that the
1403   /// instruction will be scalarized when vectorizing with the associated
1404   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1405   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1406 
1407   /// Holds the instructions known to be uniform after vectorization.
1408   /// The data is collected per VF.
1409   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1410 
1411   /// Holds the instructions known to be scalar after vectorization.
1412   /// The data is collected per VF.
1413   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1414 
1415   /// Holds the instructions (address computations) that are forced to be
1416   /// scalarized.
1417   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1418 
1419   /// Returns the expected difference in cost from scalarizing the expression
1420   /// feeding a predicated instruction \p PredInst. The instructions to
1421   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1422   /// non-negative return value implies the expression will be scalarized.
1423   /// Currently, only single-use chains are considered for scalarization.
1424   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1425                               unsigned VF);
1426 
1427   /// Collect the instructions that are uniform after vectorization. An
1428   /// instruction is uniform if we represent it with a single scalar value in
1429   /// the vectorized loop corresponding to each vector iteration. Examples of
1430   /// uniform instructions include pointer operands of consecutive or
1431   /// interleaved memory accesses. Note that although uniformity implies an
1432   /// instruction will be scalar, the reverse is not true. In general, a
1433   /// scalarized instruction will be represented by VF scalar values in the
1434   /// vectorized loop, each corresponding to an iteration of the original
1435   /// scalar loop.
1436   void collectLoopUniforms(unsigned VF);
1437 
1438   /// Collect the instructions that are scalar after vectorization. An
1439   /// instruction is scalar if it is known to be uniform or will be scalarized
1440   /// during vectorization. Non-uniform scalarized instructions will be
1441   /// represented by VF values in the vectorized loop, each corresponding to an
1442   /// iteration of the original scalar loop.
1443   void collectLoopScalars(unsigned VF);
1444 
1445   /// Keeps cost model vectorization decision and cost for instructions.
1446   /// Right now it is used for memory instructions only.
1447   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1448                                 std::pair<InstWidening, unsigned>>;
1449 
1450   DecisionList WideningDecisions;
1451 
1452   /// Returns true if \p V is expected to be vectorized and it needs to be
1453   /// extracted.
1454   bool needsExtract(Value *V, unsigned VF) const {
1455     Instruction *I = dyn_cast<Instruction>(V);
1456     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1457       return false;
1458 
1459     // Assume we can vectorize V (and hence we need extraction) if the
1460     // scalars are not computed yet. This can happen, because it is called
1461     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1462     // the scalars are collected. That should be a safe assumption in most
1463     // cases, because we check if the operands have vectorizable types
1464     // beforehand in LoopVectorizationLegality.
1465     return Scalars.find(VF) == Scalars.end() ||
1466            !isScalarAfterVectorization(I, VF);
1467   };
1468 
1469   /// Returns a range containing only operands needing to be extracted.
1470   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1471                                                    unsigned VF) {
1472     return SmallVector<Value *, 4>(make_filter_range(
1473         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1474   }
1475 
1476 public:
1477   /// The loop that we evaluate.
1478   Loop *TheLoop;
1479 
1480   /// Predicated scalar evolution analysis.
1481   PredicatedScalarEvolution &PSE;
1482 
1483   /// Loop Info analysis.
1484   LoopInfo *LI;
1485 
1486   /// Vectorization legality.
1487   LoopVectorizationLegality *Legal;
1488 
1489   /// Vector target information.
1490   const TargetTransformInfo &TTI;
1491 
1492   /// Target Library Info.
1493   const TargetLibraryInfo *TLI;
1494 
1495   /// Demanded bits analysis.
1496   DemandedBits *DB;
1497 
1498   /// Assumption cache.
1499   AssumptionCache *AC;
1500 
1501   /// Interface to emit optimization remarks.
1502   OptimizationRemarkEmitter *ORE;
1503 
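  /// The function that contains the loop being evaluated.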
1504   const Function *TheFunction;
1505 
1506   /// Loop Vectorize Hint.
1507   const LoopVectorizeHints *Hints;
1508 
  /// The interleaved access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1511   InterleavedAccessInfo &InterleaveInfo;
1512 
1513   /// Values to ignore in the cost model.
1514   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1515 
1516   /// Values to ignore in the cost model when VF > 1.
1517   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1518 };
1519 
1520 } // end namespace llvm
1521 
1522 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1523 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1529 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1530 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1531 // provides *explicit vectorization hints* (LV can bypass legal checks and
1532 // assume that vectorization is legal). However, both hints are implemented
1533 // using the same metadata (llvm.loop.vectorize, processed by
1534 // LoopVectorizeHints). This will be fixed in the future when the native IR
1535 // representation for pragma 'omp simd' is introduced.
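//
// For illustration (a hypothetical source-level loop nest), the outer loop
// below carries the kind of explicit annotation this function looks for:
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)        // annotated outer loop
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];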
1536 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1537                                    OptimizationRemarkEmitter *ORE) {
1538   assert(!OuterLp->empty() && "This is not an outer loop");
1539   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1540 
1541   // Only outer loops with an explicit vectorization hint are supported.
1542   // Unannotated outer loops are ignored.
1543   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1544     return false;
1545 
1546   Function *Fn = OuterLp->getHeader()->getParent();
1547   if (!Hints.allowVectorization(Fn, OuterLp,
1548                                 true /*VectorizeOnlyWhenForced*/)) {
1549     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1550     return false;
1551   }
1552 
1553   if (Hints.getInterleave() > 1) {
1554     // TODO: Interleave support is future work.
1555     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1556                          "outer loops.\n");
1557     Hints.emitRemarkWithHints();
1558     return false;
1559   }
1560 
1561   return true;
1562 }
1563 
1564 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1565                                   OptimizationRemarkEmitter *ORE,
1566                                   SmallVectorImpl<Loop *> &V) {
1567   // Collect inner loops and outer loops without irreducible control flow. For
1568   // now, only collect outer loops that have explicit vectorization hints. If we
1569   // are stress testing the VPlan H-CFG construction, we collect the outermost
1570   // loop of every loop nest.
1571   if (L.empty() || VPlanBuildStressTest ||
1572       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1573     LoopBlocksRPO RPOT(&L);
1574     RPOT.perform(LI);
1575     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1576       V.push_back(&L);
1577       // TODO: Collect inner loops inside marked outer loops in case
1578       // vectorization fails for the outer loop. Do not invoke
1579       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1580       // already known to be reducible. We can use an inherited attribute for
1581       // that.
1582       return;
1583     }
1584   }
1585   for (Loop *InnerL : L)
1586     collectSupportedLoops(*InnerL, LI, ORE, V);
1587 }
1588 
1589 namespace {
1590 
1591 /// The LoopVectorize Pass.
1592 struct LoopVectorize : public FunctionPass {
1593   /// Pass identification, replacement for typeid
1594   static char ID;
1595 
1596   LoopVectorizePass Impl;
1597 
1598   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1599                          bool VectorizeOnlyWhenForced = false)
1600       : FunctionPass(ID) {
1601     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1602     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1603     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1604   }
1605 
1606   bool runOnFunction(Function &F) override {
1607     if (skipFunction(F))
1608       return false;
1609 
1610     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1611     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1612     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1613     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1614     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1615     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1616     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1617     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1618     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1619     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1620     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1621     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1622     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1623 
1624     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1625         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1626 
1627     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1628                         GetLAA, *ORE, PSI);
1629   }
1630 
1631   void getAnalysisUsage(AnalysisUsage &AU) const override {
1632     AU.addRequired<AssumptionCacheTracker>();
1633     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1634     AU.addRequired<DominatorTreeWrapperPass>();
1635     AU.addRequired<LoopInfoWrapperPass>();
1636     AU.addRequired<ScalarEvolutionWrapperPass>();
1637     AU.addRequired<TargetTransformInfoWrapperPass>();
1638     AU.addRequired<AAResultsWrapperPass>();
1639     AU.addRequired<LoopAccessLegacyAnalysis>();
1640     AU.addRequired<DemandedBitsWrapperPass>();
1641     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1642 
    // We currently do not preserve the LoopInfo/DominatorTree analyses with
    // outer loop vectorization. Until this is addressed, mark these analyses
    // as preserved only for the non-VPlan-native path.
1646     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1647     if (!EnableVPlanNativePath) {
1648       AU.addPreserved<LoopInfoWrapperPass>();
1649       AU.addPreserved<DominatorTreeWrapperPass>();
1650     }
1651 
1652     AU.addPreserved<BasicAAWrapperPass>();
1653     AU.addPreserved<GlobalsAAWrapperPass>();
1654     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1655   }
1656 };
1657 
1658 } // end anonymous namespace
1659 
1660 //===----------------------------------------------------------------------===//
1661 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1662 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1663 //===----------------------------------------------------------------------===//
1664 
1665 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1669   Instruction *Instr = dyn_cast<Instruction>(V);
1670   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1671                      (!Instr ||
1672                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1673   // Place the code for broadcasting invariant variables in the new preheader.
1674   IRBuilder<>::InsertPointGuard Guard(Builder);
1675   if (SafeToHoist)
1676     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1677 
1678   // Broadcast the scalar into all locations in the vector.
1679   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1680 
1681   return Shuf;
1682 }
1683 
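// For illustration (assuming an integer IV with start 0, step 1, VF = 4 and
// UF = 1), the vector IV phi created below starts at <0, 1, 2, 3> and is
// advanced by the splat <4, 4, 4, 4> (i.e. VF * Step) on every vector
// iteration.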
1684 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1685     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1686   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1687          "Expected either an induction phi-node or a truncate of it!");
1688   Value *Start = II.getStartValue();
1689 
1690   // Construct the initial value of the vector IV in the vector loop preheader
1691   auto CurrIP = Builder.saveIP();
1692   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1693   if (isa<TruncInst>(EntryVal)) {
1694     assert(Start->getType()->isIntegerTy() &&
1695            "Truncation requires an integer type");
1696     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1697     Step = Builder.CreateTrunc(Step, TruncType);
1698     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1699   }
1700   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1701   Value *SteppedStart =
1702       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1703 
1704   // We create vector phi nodes for both integer and floating-point induction
1705   // variables. Here, we determine the kind of arithmetic we will perform.
1706   Instruction::BinaryOps AddOp;
1707   Instruction::BinaryOps MulOp;
1708   if (Step->getType()->isIntegerTy()) {
1709     AddOp = Instruction::Add;
1710     MulOp = Instruction::Mul;
1711   } else {
1712     AddOp = II.getInductionOpcode();
1713     MulOp = Instruction::FMul;
1714   }
1715 
1716   // Multiply the vectorization factor by the step using integer or
1717   // floating-point arithmetic as appropriate.
1718   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1719   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1720 
1721   // Create a vector splat to use in the induction update.
1722   //
1723   // FIXME: If the step is non-constant, we create the vector splat with
1724   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1725   //        handle a constant vector splat.
1726   Value *SplatVF = isa<Constant>(Mul)
1727                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1728                        : Builder.CreateVectorSplat(VF, Mul);
1729   Builder.restoreIP(CurrIP);
1730 
1731   // We may need to add the step a number of times, depending on the unroll
1732   // factor. The last of those goes into the PHI.
1733   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1734                                     &*LoopVectorBody->getFirstInsertionPt());
1735   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1736   Instruction *LastInduction = VecInd;
1737   for (unsigned Part = 0; Part < UF; ++Part) {
1738     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1739 
1740     if (isa<TruncInst>(EntryVal))
1741       addMetadata(LastInduction, EntryVal);
1742     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1743 
1744     LastInduction = cast<Instruction>(addFastMathFlag(
1745         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1746     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1747   }
1748 
1749   // Move the last step to the end of the latch block. This ensures consistent
1750   // placement of all induction updates.
1751   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1752   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1753   auto *ICmp = cast<Instruction>(Br->getCondition());
1754   LastInduction->moveBefore(ICmp);
1755   LastInduction->setName("vec.ind.next");
1756 
1757   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1758   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1759 }
1760 
1761 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1762   return Cost->isScalarAfterVectorization(I, VF) ||
1763          Cost->isProfitableToScalarize(I, VF);
1764 }
1765 
1766 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1767   if (shouldScalarizeInstruction(IV))
1768     return true;
1769   auto isScalarInst = [&](User *U) -> bool {
1770     auto *I = cast<Instruction>(U);
1771     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1772   };
1773   return llvm::any_of(IV->users(), isScalarInst);
1774 }
1775 
1776 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1777     const InductionDescriptor &ID, const Instruction *EntryVal,
1778     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1779   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1780          "Expected either an induction phi-node or a truncate of it!");
1781 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1788   if (isa<TruncInst>(EntryVal))
1789     return;
1790 
1791   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1792   if (Casts.empty())
1793     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any) have no uses outside the
  // induction update chain itself.
1797   Instruction *CastInst = *Casts.begin();
1798   if (Lane < UINT_MAX)
1799     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1800   else
1801     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1802 }
1803 
1804 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1805   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1806          "Primary induction variable must have an integer type");
1807 
1808   auto II = Legal->getInductionVars()->find(IV);
1809   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1810 
1811   auto ID = II->second;
1812   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1813 
1814   // The scalar value to broadcast. This will be derived from the canonical
1815   // induction variable.
1816   Value *ScalarIV = nullptr;
1817 
1818   // The value from the original loop to which we are mapping the new induction
1819   // variable.
1820   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1821 
1822   // True if we have vectorized the induction variable.
1823   auto VectorizedIV = false;
1824 
1825   // Determine if we want a scalar version of the induction variable. This is
1826   // true if the induction variable itself is not widened, or if it has at
1827   // least one user in the loop that is not widened.
1828   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1829 
1830   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1832   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1833          "Induction step should be loop invariant");
1834   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1835   Value *Step = nullptr;
1836   if (PSE.getSE()->isSCEVable(IV->getType())) {
1837     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1838     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1839                              LoopVectorPreHeader->getTerminator());
1840   } else {
1841     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1842   }
1843 
1844   // Try to create a new independent vector induction variable. If we can't
1845   // create the phi node, we will splat the scalar induction variable in each
1846   // loop iteration.
1847   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1848     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1849     VectorizedIV = true;
1850   }
1851 
1852   // If we haven't yet vectorized the induction variable, or if we will create
1853   // a scalar one, we need to define the scalar induction variable and step
1854   // values. If we were given a truncation type, truncate the canonical
1855   // induction variable and step. Otherwise, derive these values from the
1856   // induction descriptor.
1857   if (!VectorizedIV || NeedsScalarIV) {
1858     ScalarIV = Induction;
1859     if (IV != OldInduction) {
1860       ScalarIV = IV->getType()->isIntegerTy()
1861                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1862                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1863                                           IV->getType());
1864       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1865       ScalarIV->setName("offset.idx");
1866     }
1867     if (Trunc) {
1868       auto *TruncType = cast<IntegerType>(Trunc->getType());
1869       assert(Step->getType()->isIntegerTy() &&
1870              "Truncation requires an integer step");
1871       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1872       Step = Builder.CreateTrunc(Step, TruncType);
1873     }
1874   }
1875 
1876   // If we haven't yet vectorized the induction variable, splat the scalar
1877   // induction variable, and build the necessary step vectors.
1878   // TODO: Don't do it unless the vectorized IV is really required.
1879   if (!VectorizedIV) {
1880     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1881     for (unsigned Part = 0; Part < UF; ++Part) {
1882       Value *EntryPart =
1883           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1884       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1885       if (Trunc)
1886         addMetadata(EntryPart, Trunc);
1887       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1888     }
1889   }
1890 
1891   // If an induction variable is only used for counting loop iterations or
1892   // calculating addresses, it doesn't need to be widened. Create scalar steps
1893   // that can be used by instructions we will later scalarize. Note that the
1894   // addition of the scalar steps will not increase the number of instructions
1895   // in the loop in the common case prior to InstCombine. We will be trading
1896   // one vector extract for each scalar step.
1897   if (NeedsScalarIV)
1898     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1899 }
1900 
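// For illustration (assuming an integer element type), getStepVector(Val,
// StartIdx, Step) returns
//   Val + <StartIdx, StartIdx + 1, ..., StartIdx + VF - 1> * Step,
// e.g. for Val = <x, x, x, x>, StartIdx = 4 and Step = 2 the result is
// <x + 8, x + 10, x + 12, x + 14>.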
1901 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1902                                           Instruction::BinaryOps BinOp) {
1903   // Create and check the types.
1904   assert(Val->getType()->isVectorTy() && "Must be a vector");
1905   int VLen = Val->getType()->getVectorNumElements();
1906 
1907   Type *STy = Val->getType()->getScalarType();
1908   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1909          "Induction Step must be an integer or FP");
1910   assert(Step->getType() == STy && "Step has wrong type");
1911 
1912   SmallVector<Constant *, 8> Indices;
1913 
1914   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
1916     for (int i = 0; i < VLen; ++i)
1917       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1918 
1919     // Add the consecutive indices to the vector value.
1920     Constant *Cv = ConstantVector::get(Indices);
1921     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1922     Step = Builder.CreateVectorSplat(VLen, Step);
1923     assert(Step->getType() == Val->getType() && "Invalid step vec");
1924     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1925     // which can be found from the original scalar operations.
1926     Step = Builder.CreateMul(Cv, Step);
1927     return Builder.CreateAdd(Val, Step, "induction");
1928   }
1929 
1930   // Floating point induction.
1931   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1932          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
1934   for (int i = 0; i < VLen; ++i)
1935     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1936 
1937   // Add the consecutive indices to the vector value.
1938   Constant *Cv = ConstantVector::get(Indices);
1939 
1940   Step = Builder.CreateVectorSplat(VLen, Step);
1941 
1942   // Floating point operations had to be 'fast' to enable the induction.
1943   FastMathFlags Flags;
1944   Flags.setFast();
1945 
1946   Value *MulOp = Builder.CreateFMul(Cv, Step);
1947   if (isa<Instruction>(MulOp))
1948     // Have to check, MulOp may be a constant
1949     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1950 
1951   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1952   if (isa<Instruction>(BOp))
1953     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1954   return BOp;
1955 }
1956 
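// For illustration (assuming an integer IV, VF = 4 and UF = 2), the scalar
// steps built below are ScalarIV + (Part * VF + Lane) * Step, i.e. the eight
// values ScalarIV + 0 * Step, ScalarIV + 1 * Step, ..., ScalarIV + 7 * Step,
// unless EntryVal is uniform, in which case only lane 0 of each part is
// generated.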
1957 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1958                                            Instruction *EntryVal,
1959                                            const InductionDescriptor &ID) {
1960   // We shouldn't have to build scalar steps if we aren't vectorizing.
1961   assert(VF > 1 && "VF should be greater than one");
1962 
  // Get the value type and ensure that it and the step have the same type.
1964   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1965   assert(ScalarIVTy == Step->getType() &&
1966          "Val and Step should have the same type");
1967 
1968   // We build scalar steps for both integer and floating-point induction
1969   // variables. Here, we determine the kind of arithmetic we will perform.
1970   Instruction::BinaryOps AddOp;
1971   Instruction::BinaryOps MulOp;
1972   if (ScalarIVTy->isIntegerTy()) {
1973     AddOp = Instruction::Add;
1974     MulOp = Instruction::Mul;
1975   } else {
1976     AddOp = ID.getInductionOpcode();
1977     MulOp = Instruction::FMul;
1978   }
1979 
1980   // Determine the number of scalars we need to generate for each unroll
1981   // iteration. If EntryVal is uniform, we only need to generate the first
1982   // lane. Otherwise, we generate all VF values.
1983   unsigned Lanes =
1984       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1985                                                                          : VF;
1986   // Compute the scalar steps and save the results in VectorLoopValueMap.
1987   for (unsigned Part = 0; Part < UF; ++Part) {
1988     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1989       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1990       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1991       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1992       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1993       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1994     }
1995   }
1996 }
1997 
1998 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1999   assert(V != Induction && "The new induction variable should not be used.");
2000   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2001   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2002 
2003   // If we have a stride that is replaced by one, do it here. Defer this for
2004   // the VPlan-native path until we start running Legal checks in that path.
2005   if (!EnableVPlanNativePath && Legal->hasStride(V))
2006     V = ConstantInt::get(V->getType(), 1);
2007 
2008   // If we have a vector mapped to this value, return it.
2009   if (VectorLoopValueMap.hasVectorValue(V, Part))
2010     return VectorLoopValueMap.getVectorValue(V, Part);
2011 
2012   // If the value has not been vectorized, check if it has been scalarized
2013   // instead. If it has been scalarized, and we actually need the value in
2014   // vector form, we will construct the vector values on demand.
2015   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2016     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2017 
2018     // If we've scalarized a value, that value should be an instruction.
2019     auto *I = cast<Instruction>(V);
2020 
2021     // If we aren't vectorizing, we can just copy the scalar map values over to
2022     // the vector map.
2023     if (VF == 1) {
2024       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2025       return ScalarValue;
2026     }
2027 
2028     // Get the last scalar instruction we generated for V and Part. If the value
2029     // is known to be uniform after vectorization, this corresponds to lane zero
2030     // of the Part unroll iteration. Otherwise, the last instruction is the one
2031     // we created for the last vector lane of the Part unroll iteration.
2032     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2033     auto *LastInst = cast<Instruction>(
2034         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2035 
2036     // Set the insert point after the last scalarized instruction. This ensures
2037     // the insertelement sequence will directly follow the scalar definitions.
2038     auto OldIP = Builder.saveIP();
2039     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2040     Builder.SetInsertPoint(&*NewIP);
2041 
2042     // However, if we are vectorizing, we need to construct the vector values.
2043     // If the value is known to be uniform after vectorization, we can just
2044     // broadcast the scalar value corresponding to lane zero for each unroll
2045     // iteration. Otherwise, we construct the vector values using insertelement
2046     // instructions. Since the resulting vectors are stored in
2047     // VectorLoopValueMap, we will only generate the insertelements once.
2048     Value *VectorValue = nullptr;
2049     if (Cost->isUniformAfterVectorization(I, VF)) {
2050       VectorValue = getBroadcastInstrs(ScalarValue);
2051       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2052     } else {
2053       // Initialize packing with insertelements to start from undef.
2054       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2055       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2056       for (unsigned Lane = 0; Lane < VF; ++Lane)
2057         packScalarIntoVectorValue(V, {Part, Lane});
2058       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2059     }
2060     Builder.restoreIP(OldIP);
2061     return VectorValue;
2062   }
2063 
2064   // If this scalar is unknown, assume that it is a constant or that it is
2065   // loop invariant. Broadcast V and save the value for future uses.
2066   Value *B = getBroadcastInstrs(V);
2067   VectorLoopValueMap.setVectorValue(V, Part, B);
2068   return B;
2069 }
2070 
2071 Value *
2072 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2073                                             const VPIteration &Instance) {
2074   // If the value is not an instruction contained in the loop, it should
2075   // already be scalar.
2076   if (OrigLoop->isLoopInvariant(V))
2077     return V;
2078 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2082 
2083   // If the value from the original loop has not been vectorized, it is
2084   // represented by UF x VF scalar values in the new loop. Return the requested
2085   // scalar value.
2086   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2087     return VectorLoopValueMap.getScalarValue(V, Instance);
2088 
2089   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2090   // for the given unroll part. If this entry is not a vector type (i.e., the
2091   // vectorization factor is one), there is no need to generate an
2092   // extractelement instruction.
2093   auto *U = getOrCreateVectorValue(V, Instance.Part);
2094   if (!U->getType()->isVectorTy()) {
2095     assert(VF == 1 && "Value not scalarized has non-vector type");
2096     return U;
2097   }
2098 
2099   // Otherwise, the value from the original loop has been vectorized and is
2100   // represented by UF vector values. Extract and return the requested scalar
2101   // value from the appropriate vector lane.
2102   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2103 }
2104 
2105 void InnerLoopVectorizer::packScalarIntoVectorValue(
2106     Value *V, const VPIteration &Instance) {
2107   assert(V != Induction && "The new induction variable should not be used.");
2108   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2109   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2110 
2111   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2112   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2113   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2114                                             Builder.getInt32(Instance.Lane));
2115   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2116 }
2117 
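// For illustration (with VF = 4), reverseVector turns <a, b, c, d> into
// <d, c, b, a> using the shuffle mask <3, 2, 1, 0>.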
2118 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2119   assert(Vec->getType()->isVectorTy() && "Invalid type");
2120   SmallVector<Constant *, 8> ShuffleMask;
2121   for (unsigned i = 0; i < VF; ++i)
2122     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2123 
2124   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2125                                      ConstantVector::get(ShuffleMask),
2126                                      "reverse");
2127 }
2128 
2129 // Return whether we allow using masked interleave-groups (for dealing with
2130 // strided loads/stores that reside in predicated blocks, or for dealing
2131 // with gaps).
2132 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2133   // If an override option has been passed in for interleaved accesses, use it.
2134   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2135     return EnableMaskedInterleavedMemAccesses;
2136 
2137   return TTI.enableMaskedInterleavedAccessVectorization();
2138 }
2139 
2140 // Try to vectorize the interleave group that \p Instr belongs to.
2141 //
2142 // E.g. Translate following interleaved load group (factor = 3):
2143 //   for (i = 0; i < N; i+=3) {
2144 //     R = Pic[i];             // Member of index 0
2145 //     G = Pic[i+1];           // Member of index 1
2146 //     B = Pic[i+2];           // Member of index 2
2147 //     ... // do something to R, G, B
2148 //   }
2149 // To:
2150 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2151 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2152 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2153 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2154 //
2155 // Or translate following interleaved store group (factor = 3):
2156 //   for (i = 0; i < N; i+=3) {
2157 //     ... do something to R, G, B
2158 //     Pic[i]   = R;           // Member of index 0
2159 //     Pic[i+1] = G;           // Member of index 1
2160 //     Pic[i+2] = B;           // Member of index 2
2161 //   }
2162 // To:
2163 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2164 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2165 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2166 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2167 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2168 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2169                                                    VectorParts *BlockInMask) {
2170   const InterleaveGroup<Instruction> *Group =
2171       Cost->getInterleavedAccessGroup(Instr);
2172   assert(Group && "Fail to get an interleaved access group.");
2173 
2174   // Skip if current instruction is not the insert position.
2175   if (Instr != Group->getInsertPos())
2176     return;
2177 
2178   const DataLayout &DL = Instr->getModule()->getDataLayout();
2179   Value *Ptr = getLoadStorePointerOperand(Instr);
2180 
  // Prepare the vector type for the interleaved load/store.
2182   Type *ScalarTy = getMemInstValueType(Instr);
2183   unsigned InterleaveFactor = Group->getFactor();
2184   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2185   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2186 
  // Prepare the new pointers.
2188   setDebugLocFromInst(Builder, Ptr);
2189   SmallVector<Value *, 2> NewPtrs;
2190   unsigned Index = Group->getIndex(Instr);
2191 
2192   VectorParts Mask;
2193   bool IsMaskForCondRequired = BlockInMask;
2194   if (IsMaskForCondRequired) {
2195     Mask = *BlockInMask;
2196     // TODO: extend the masked interleaved-group support to reversed access.
2197     assert(!Group->isReverse() && "Reversed masked interleave-group "
2198                                   "not supported.");
2199   }
2200 
2201   // If the group is reverse, adjust the index to refer to the last vector lane
2202   // instead of the first. We adjust the index from the first vector lane,
2203   // rather than directly getting the pointer for lane VF - 1, because the
2204   // pointer operand of the interleaved access is supposed to be uniform. For
2205   // uniform instructions, we're only required to generate a value for the
2206   // first vector lane in each unroll iteration.
2207   if (Group->isReverse())
2208     Index += (VF - 1) * Group->getFactor();
2209 
2210   bool InBounds = false;
2211   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2212     InBounds = gep->isInBounds();
2213 
2214   for (unsigned Part = 0; Part < UF; Part++) {
2215     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2216 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2228     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2229     if (InBounds)
2230       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2231 
2232     // Cast to the vector pointer type.
2233     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2234   }
2235 
2236   setDebugLocFromInst(Builder, Instr);
2237   Value *UndefVec = UndefValue::get(VecTy);
2238 
2239   Value *MaskForGaps = nullptr;
2240   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2241     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2242     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2243   }
2244 
2245   // Vectorize the interleaved load group.
2246   if (isa<LoadInst>(Instr)) {
2247     // For each unroll part, create a wide load for the group.
2248     SmallVector<Value *, 2> NewLoads;
2249     for (unsigned Part = 0; Part < UF; Part++) {
2250       Instruction *NewLoad;
2251       if (IsMaskForCondRequired || MaskForGaps) {
2252         assert(useMaskedInterleavedAccesses(*TTI) &&
2253                "masked interleaved groups are not allowed.");
2254         Value *GroupMask = MaskForGaps;
2255         if (IsMaskForCondRequired) {
2256           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2257           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2258           Value *ShuffledMask = Builder.CreateShuffleVector(
2259               Mask[Part], Undefs, RepMask, "interleaved.mask");
2260           GroupMask = MaskForGaps
2261                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2262                                                 MaskForGaps)
2263                           : ShuffledMask;
2264         }
2265         NewLoad =
2266             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2267                                      GroupMask, UndefVec, "wide.masked.vec");
2268       }
2269       else
2270         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2271                                             Group->getAlignment(), "wide.vec");
2272       Group->addMetadata(NewLoad);
2273       NewLoads.push_back(NewLoad);
2274     }
2275 
2276     // For each member in the group, shuffle out the appropriate data from the
2277     // wide loads.
2278     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2279       Instruction *Member = Group->getMember(I);
2280 
2281       // Skip the gaps in the group.
2282       if (!Member)
2283         continue;
2284 
2285       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2286       for (unsigned Part = 0; Part < UF; Part++) {
2287         Value *StridedVec = Builder.CreateShuffleVector(
2288             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2289 
        // If this member has a different type, cast the result to that type.
2291         if (Member->getType() != ScalarTy) {
2292           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2293           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2294         }
2295 
2296         if (Group->isReverse())
2297           StridedVec = reverseVector(StridedVec);
2298 
2299         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2300       }
2301     }
2302     return;
2303   }
2304 
  // The sub-vector type for the current instruction.
2306   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2307 
2308   // Vectorize the interleaved store group.
2309   for (unsigned Part = 0; Part < UF; Part++) {
2310     // Collect the stored vector from each member.
2311     SmallVector<Value *, 4> StoredVecs;
2312     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
2314       Instruction *Member = Group->getMember(i);
2315       assert(Member && "Fail to get a member from an interleaved store group");
2316 
2317       Value *StoredVec = getOrCreateVectorValue(
2318           cast<StoreInst>(Member)->getValueOperand(), Part);
2319       if (Group->isReverse())
2320         StoredVec = reverseVector(StoredVec);
2321 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2325         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2326 
2327       StoredVecs.push_back(StoredVec);
2328     }
2329 
2330     // Concatenate all vectors into a wide vector.
2331     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2332 
2333     // Interleave the elements in the wide vector.
2334     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2335     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2336                                               "interleaved.vec");
2337 
2338     Instruction *NewStoreInstr;
2339     if (IsMaskForCondRequired) {
2340       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2341       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2342       Value *ShuffledMask = Builder.CreateShuffleVector(
2343           Mask[Part], Undefs, RepMask, "interleaved.mask");
2344       NewStoreInstr = Builder.CreateMaskedStore(
2345           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2346     }
2347     else
2348       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2349         Group->getAlignment());
2350 
2351     Group->addMetadata(NewStoreInstr);
2352   }
2353 }
2354 
2355 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2356                                                      VectorParts *BlockInMask) {
2357   // Attempt to issue a wide load.
2358   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2359   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2360 
2361   assert((LI || SI) && "Invalid Load/Store instruction");
2362 
2363   LoopVectorizationCostModel::InstWidening Decision =
2364       Cost->getWideningDecision(Instr, VF);
2365   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2366          "CM decision should be taken at this point");
2367   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2368     return vectorizeInterleaveGroup(Instr);
2369 
2370   Type *ScalarDataTy = getMemInstValueType(Instr);
2371   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2372   Value *Ptr = getLoadStorePointerOperand(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2375   const DataLayout &DL = Instr->getModule()->getDataLayout();
2376   const Align Alignment =
2377       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2378   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2379 
2380   // Determine if the pointer operand of the access is either consecutive or
2381   // reverse consecutive.
2382   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2383   bool ConsecutiveStride =
2384       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2385   bool CreateGatherScatter =
2386       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2387 
2388   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2389   // gather/scatter. Otherwise Decision should have been to Scalarize.
2390   assert((ConsecutiveStride || CreateGatherScatter) &&
2391          "The instruction should be scalarized");
2392 
2393   // Handle consecutive loads/stores.
2394   if (ConsecutiveStride)
2395     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2396 
2397   VectorParts Mask;
2398   bool isMaskRequired = BlockInMask;
2399   if (isMaskRequired)
2400     Mask = *BlockInMask;
2401 
2402   bool InBounds = false;
2403   if (auto *gep = dyn_cast<GetElementPtrInst>(
2404           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2405     InBounds = gep->isInBounds();
2406 
2407   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2408     // Calculate the pointer for the specific unroll-part.
2409     GetElementPtrInst *PartPtr = nullptr;
2410 
2411     if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide load/store needs to start at the last vector element.
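      // For illustration (with VF = 4 and Part = 1), the part pointer is
      // Ptr - 1 * 4 - (4 - 1) = Ptr - 7, so the wide access covers elements
      // Ptr[-7] .. Ptr[-4]; the loaded/stored data is reversed separately.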
2414       PartPtr = cast<GetElementPtrInst>(
2415           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2416       PartPtr->setIsInBounds(InBounds);
2417       PartPtr = cast<GetElementPtrInst>(
2418           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2419       PartPtr->setIsInBounds(InBounds);
2420       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2421         Mask[Part] = reverseVector(Mask[Part]);
2422     } else {
2423       PartPtr = cast<GetElementPtrInst>(
2424           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2425       PartPtr->setIsInBounds(InBounds);
2426     }
2427 
2428     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2429   };
2430 
2431   // Handle Stores:
2432   if (SI) {
2433     setDebugLocFromInst(Builder, SI);
2434 
2435     for (unsigned Part = 0; Part < UF; ++Part) {
2436       Instruction *NewSI = nullptr;
2437       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2438       if (CreateGatherScatter) {
2439         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2440         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2441         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2442                                             Alignment.value(), MaskPart);
2443       } else {
2444         if (Reverse) {
2445           // If we store to reverse consecutive memory locations, then we need
2446           // to reverse the order of elements in the stored value.
2447           StoredVal = reverseVector(StoredVal);
2448           // We don't want to update the value in the map as it might be used in
2449           // another expression. So don't call resetVectorValue(StoredVal).
2450         }
2451         auto *VecPtr = CreateVecPtr(Part, Ptr);
2452         if (isMaskRequired)
2453           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
2454                                             Alignment.value(), Mask[Part]);
2455         else
2456           NewSI =
2457               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2458       }
2459       addMetadata(NewSI, SI);
2460     }
2461     return;
2462   }
2463 
2464   // Handle loads.
2465   assert(LI && "Must have a load instruction");
2466   setDebugLocFromInst(Builder, LI);
2467   for (unsigned Part = 0; Part < UF; ++Part) {
2468     Value *NewLI;
2469     if (CreateGatherScatter) {
2470       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2471       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2472       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2473                                          nullptr, "wide.masked.gather");
2474       addMetadata(NewLI, LI);
2475     } else {
2476       auto *VecPtr = CreateVecPtr(Part, Ptr);
2477       if (isMaskRequired)
2478         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
2479                                          UndefValue::get(DataTy),
2480                                          "wide.masked.load");
2481       else
2482         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2483                                           "wide.load");
2484 
2485       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2486       addMetadata(NewLI, LI);
2487       if (Reverse)
2488         NewLI = reverseVector(NewLI);
2489     }
2490     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2491   }
2492 }
2493 
2494 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2495                                                const VPIteration &Instance,
2496                                                bool IfPredicateInstr) {
2497   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2498 
2499   setDebugLocFromInst(Builder, Instr);
2500 
  // Does this instruction return a value?
2502   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2503 
2504   Instruction *Cloned = Instr->clone();
2505   if (!IsVoidRetTy)
2506     Cloned->setName(Instr->getName() + ".cloned");
2507 
2508   // Replace the operands of the cloned instructions with their scalar
2509   // equivalents in the new loop.
2510   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2511     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2512     Cloned->setOperand(op, NewOp);
2513   }
2514   addNewMetadata(Cloned, Instr);
2515 
2516   // Place the cloned scalar in the new loop.
2517   Builder.Insert(Cloned);
2518 
2519   // Add the cloned scalar to the scalar map entry.
2520   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2521 
  // If we just cloned a new assumption, add it to the assumption cache.
2523   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2524     if (II->getIntrinsicID() == Intrinsic::assume)
2525       AC->registerAssumption(II);
2526 
2527   // End if-block.
2528   if (IfPredicateInstr)
2529     PredicatedInstructions.push_back(Cloned);
2530 }
2531 
2532 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2533                                                       Value *End, Value *Step,
2534                                                       Instruction *DL) {
2535   BasicBlock *Header = L->getHeader();
2536   BasicBlock *Latch = L->getLoopLatch();
2537   // As we're just creating this loop, it's possible no latch exists
2538   // yet. If so, use the header as this will be a single block loop.
2539   if (!Latch)
2540     Latch = Header;
2541 
2542   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2543   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2544   setDebugLocFromInst(Builder, OldInst);
2545   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2546 
2547   Builder.SetInsertPoint(Latch->getTerminator());
2548   setDebugLocFromInst(Builder, OldInst);
2549 
2550   // Create i+1 and fill the PHINode.
2551   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2552   Induction->addIncoming(Start, L->getLoopPreheader());
2553   Induction->addIncoming(Next, Latch);
2554   // Create the compare.
2555   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2556   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2557 
2558   // Now we have two terminators. Remove the old one from the block.
2559   Latch->getTerminator()->eraseFromParent();
2560 
2561   return Induction;
2562 }
2563 
2564 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2565   if (TripCount)
2566     return TripCount;
2567 
2568   assert(L && "Create Trip Count for null loop.");
2569   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2570   // Find the loop boundaries.
2571   ScalarEvolution *SE = PSE.getSE();
2572   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2573   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2574          "Invalid loop count");
2575 
2576   Type *IdxTy = Legal->getWidestInductionType();
2577   assert(IdxTy && "No type for induction");
2578 
  // The exit count might have type i64 while the phi has type i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we can get a backedge-taken count in that case is if
  // the induction variable is signed and therefore does not overflow, so the
  // truncation is legal.
2584   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2585       IdxTy->getPrimitiveSizeInBits())
2586     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2587   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2588 
2589   // Get the total trip count from the count by adding 1.
2590   const SCEV *ExitCount = SE->getAddExpr(
2591       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2592 
2593   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2594 
2595   // Expand the trip count and place the new instructions in the preheader.
2596   // Notice that the pre-header does not change, only the loop body.
2597   SCEVExpander Exp(*SE, DL, "induction");
2598 
2599   // Count holds the overall loop count (N).
2600   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2601                                 L->getLoopPreheader()->getTerminator());
2602 
2603   if (TripCount->getType()->isPointerTy())
2604     TripCount =
2605         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2606                                     L->getLoopPreheader()->getTerminator());
2607 
2608   return TripCount;
2609 }
2610 
2611 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2612   if (VectorTripCount)
2613     return VectorTripCount;
2614 
2615   Value *TC = getOrCreateTripCount(L);
2616   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2617 
2618   Type *Ty = TC->getType();
2619   Constant *Step = ConstantInt::get(Ty, VF * UF);
2620 
2621   // If the tail is to be folded by masking, round the number of iterations N
2622   // up to a multiple of Step instead of rounding down. This is done by first
2623   // adding Step-1 and then rounding down. Note that it's ok if this addition
2624   // overflows: the vector induction variable will eventually wrap to zero given
2625   // that it starts at zero and its Step is a power of two; the loop will then
2626   // exit, with the last early-exit vector comparison also producing all-true.
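  // E.g. with Step = VF * UF = 8 and N = 10, the rounded-up count is 17, which
  // yields a vector trip count of 16 below; the tail is then handled by masked
  // lanes in the final vector iteration.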
2627   if (Cost->foldTailByMasking()) {
2628     assert(isPowerOf2_32(VF * UF) &&
2629            "VF*UF must be a power of 2 when folding tail by masking");
2630     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2631   }
2632 
2633   // Now we need to generate the expression for the part of the loop that the
2634   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2635   // iterations are not required for correctness, or N - Step, otherwise. Step
2636   // is equal to the vectorization factor (number of SIMD elements) times the
2637   // unroll factor (number of SIMD instructions).
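  // E.g. when the tail is not folded, N = 10 and Step = 8 give a remainder of
  // 2 and a vector trip count of 8; the remaining 2 iterations run in the
  // scalar epilogue loop.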
2638   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2639 
2640   // If there is a non-reversed interleaved group that may speculatively access
2641   // memory out-of-bounds, we need to ensure that there will be at least one
2642   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2643   // the trip count, we set the remainder to be equal to the step. If the step
2644   // does not evenly divide the trip count, no adjustment is necessary since
2645   // there will already be scalar iterations. Note that the minimum iterations
2646   // check ensures that N >= Step.
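  // E.g. if N = 16 and Step = 8, the remainder computed above is 0 and is
  // bumped to 8 here, leaving 8 iterations for the scalar epilogue.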
2647   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2648     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2649     R = Builder.CreateSelect(IsZero, Step, R);
2650   }
2651 
2652   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2653 
2654   return VectorTripCount;
2655 }
2656 
2657 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2658                                                    const DataLayout &DL) {
2659   // Verify that V is a vector type with same number of elements as DstVTy.
2660   unsigned VF = DstVTy->getNumElements();
2661   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2663   Type *SrcElemTy = SrcVecTy->getElementType();
2664   Type *DstElemTy = DstVTy->getElementType();
2665   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2666          "Vector elements must have same size");
2667 
2668   // Do a direct cast if element types are castable.
2669   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2670     return Builder.CreateBitOrPointerCast(V, DstVTy);
2671   }
  // V cannot be cast directly to the desired vector type. This may happen when
  // V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle it with a two-step cast through an intermediate integer
  // vector type, i.e. Ptr <-> Int <-> Float.
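  // E.g. (assuming 64-bit pointers) <4 x double> <-> <4 x i8*> is performed as
  // <4 x double> <-> <4 x i64> <-> <4 x i8*>.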
2676   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2677          "Only one type should be a pointer type");
2678   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2679          "Only one type should be a floating point type");
2680   Type *IntTy =
2681       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2682   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2683   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2684   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2685 }
2686 
2687 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2688                                                          BasicBlock *Bypass) {
2689   Value *Count = getOrCreateTripCount(L);
2690   BasicBlock *BB = L->getLoopPreheader();
2691   IRBuilder<> Builder(BB->getTerminator());
2692 
2693   // Generate code to check if the loop's trip count is less than VF * UF, or
2694   // equal to it in case a scalar epilogue is required; this implies that the
2695   // vector trip count is zero. This check also covers the case where adding one
2696   // to the backedge-taken count overflowed leading to an incorrect trip count
2697   // of zero. In this case we will also jump to the scalar loop.
2698   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2699                                           : ICmpInst::ICMP_ULT;
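  // E.g. for VF * UF = 8, a trip count below 8 (or equal to 8 when a scalar
  // epilogue is required) branches directly to the scalar loop.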
2700 
2701   // If tail is to be folded, vector loop takes care of all iterations.
2702   Value *CheckMinIters = Builder.getFalse();
2703   if (!Cost->foldTailByMasking())
2704     CheckMinIters = Builder.CreateICmp(
2705         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2706         "min.iters.check");
2707 
2708   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2709   // Update dominator tree immediately if the generated block is a
2710   // LoopBypassBlock because SCEV expansions to generate loop bypass
2711   // checks may query it before the current function is finished.
2712   DT->addNewBlock(NewBB, BB);
2713   if (L->getParentLoop())
2714     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2715   ReplaceInstWithInst(BB->getTerminator(),
2716                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2717   LoopBypassBlocks.push_back(BB);
2718 }
2719 
2720 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2721   BasicBlock *BB = L->getLoopPreheader();
2722 
  // Generate the code to check the SCEV assumptions that we made.
2724   // We want the new basic block to start at the first instruction in a
2725   // sequence of instructions that form a check.
2726   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2727                    "scev.check");
2728   Value *SCEVCheck =
2729       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2730 
2731   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2732     if (C->isZero())
2733       return;
2734 
2735   assert(!BB->getParent()->hasOptSize() &&
2736          "Cannot SCEV check stride or overflow when optimizing for size");
2737 
2738   // Create a new block containing the stride check.
2739   BB->setName("vector.scevcheck");
2740   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2741   // Update dominator tree immediately if the generated block is a
2742   // LoopBypassBlock because SCEV expansions to generate loop bypass
2743   // checks may query it before the current function is finished.
2744   DT->addNewBlock(NewBB, BB);
2745   if (L->getParentLoop())
2746     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2747   ReplaceInstWithInst(BB->getTerminator(),
2748                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2749   LoopBypassBlocks.push_back(BB);
2750   AddedSafetyChecks = true;
2751 }
2752 
2753 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2754   // VPlan-native path does not do any analysis for runtime checks currently.
2755   if (EnableVPlanNativePath)
2756     return;
2757 
2758   BasicBlock *BB = L->getLoopPreheader();
2759 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2763   Instruction *FirstCheckInst;
2764   Instruction *MemRuntimeCheck;
2765   std::tie(FirstCheckInst, MemRuntimeCheck) =
2766       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2767   if (!MemRuntimeCheck)
2768     return;
2769 
2770   if (BB->getParent()->hasOptSize()) {
2771     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2772            "Cannot emit memory checks when optimizing for size, unless forced "
2773            "to vectorize.");
2774     ORE->emit([&]() {
2775       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2776                                         L->getStartLoc(), L->getHeader())
2777              << "Code-size may be reduced by not forcing "
2778                 "vectorization, or by source-code modifications "
2779                 "eliminating the need for runtime checks "
2780                 "(e.g., adding 'restrict').";
2781     });
2782   }
2783 
2784   // Create a new block containing the memory check.
2785   BB->setName("vector.memcheck");
2786   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2787   // Update dominator tree immediately if the generated block is a
2788   // LoopBypassBlock because SCEV expansions to generate loop bypass
2789   // checks may query it before the current function is finished.
2790   DT->addNewBlock(NewBB, BB);
2791   if (L->getParentLoop())
2792     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2793   ReplaceInstWithInst(BB->getTerminator(),
2794                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2795   LoopBypassBlocks.push_back(BB);
2796   AddedSafetyChecks = true;
2797 
2798   // We currently don't use LoopVersioning for the actual loop cloning but we
2799   // still use it to add the noalias metadata.
2800   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2801                                            PSE.getSE());
2802   LVer->prepareNoAliasMetadata();
2803 }
2804 
2805 Value *InnerLoopVectorizer::emitTransformedIndex(
2806     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2807     const InductionDescriptor &ID) const {
2808 
2809   SCEVExpander Exp(*SE, DL, "induction");
2810   auto Step = ID.getStep();
2811   auto StartValue = ID.getStartValue();
2812   assert(Index->getType() == Step->getType() &&
2813          "Index type does not match StepValue type");
2814 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // better code. Unfortunately, attempting to do so on invalid IR may lead to
  // various SCEV crashes. So all we can do is use the builder and rely on
  // InstCombine for future simplifications. Here we handle only some trivial
  // cases.
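  // The transformed index is StartValue + Index * Step for integer inductions,
  // StartValue advanced by Index * Step elements for pointer inductions, and
  // StartValue fadd/fsub (Index * Step) for floating-point inductions,
  // mirroring the cases handled below.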
2821   auto CreateAdd = [&B](Value *X, Value *Y) {
2822     assert(X->getType() == Y->getType() && "Types don't match!");
2823     if (auto *CX = dyn_cast<ConstantInt>(X))
2824       if (CX->isZero())
2825         return Y;
2826     if (auto *CY = dyn_cast<ConstantInt>(Y))
2827       if (CY->isZero())
2828         return X;
2829     return B.CreateAdd(X, Y);
2830   };
2831 
2832   auto CreateMul = [&B](Value *X, Value *Y) {
2833     assert(X->getType() == Y->getType() && "Types don't match!");
2834     if (auto *CX = dyn_cast<ConstantInt>(X))
2835       if (CX->isOne())
2836         return Y;
2837     if (auto *CY = dyn_cast<ConstantInt>(Y))
2838       if (CY->isOne())
2839         return X;
2840     return B.CreateMul(X, Y);
2841   };
2842 
2843   switch (ID.getKind()) {
2844   case InductionDescriptor::IK_IntInduction: {
2845     assert(Index->getType() == StartValue->getType() &&
2846            "Index type does not match StartValue type");
2847     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2848       return B.CreateSub(StartValue, Index);
2849     auto *Offset = CreateMul(
2850         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2851     return CreateAdd(StartValue, Offset);
2852   }
2853   case InductionDescriptor::IK_PtrInduction: {
2854     assert(isa<SCEVConstant>(Step) &&
2855            "Expected constant step for pointer induction");
2856     return B.CreateGEP(
2857         StartValue->getType()->getPointerElementType(), StartValue,
2858         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2859                                            &*B.GetInsertPoint())));
2860   }
2861   case InductionDescriptor::IK_FpInduction: {
2862     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2863     auto InductionBinOp = ID.getInductionBinOp();
2864     assert(InductionBinOp &&
2865            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2866             InductionBinOp->getOpcode() == Instruction::FSub) &&
2867            "Original bin op should be defined for FP induction");
2868 
2869     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2870 
2871     // Floating point operations had to be 'fast' to enable the induction.
2872     FastMathFlags Flags;
2873     Flags.setFast();
2874 
2875     Value *MulExp = B.CreateFMul(StepValue, Index);
2876     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2878       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2879 
2880     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2881                                "induction");
2882     if (isa<Instruction>(BOp))
2883       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2884 
2885     return BOp;
2886   }
2887   case InductionDescriptor::IK_NoInduction:
2888     return nullptr;
2889   }
2890   llvm_unreachable("invalid enum");
2891 }
2892 
2893 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2894   /*
2895    In this function we generate a new loop. The new loop will contain
2896    the vectorized instructions while the old loop will continue to run the
2897    scalar remainder.
2898 
2899        [ ] <-- loop iteration number check.
2900     /   |
2901    /    v
2902   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2903   |  /  |
2904   | /   v
2905   ||   [ ]     <-- vector pre header.
2906   |/    |
2907   |     v
2908   |    [  ] \
2909   |    [  ]_|   <-- vector loop.
2910   |     |
2911   |     v
2912   |   -[ ]   <--- middle-block.
2913   |  /  |
2914   | /   v
2915   -|- >[ ]     <--- new preheader.
2916    |    |
2917    |    v
2918    |   [ ] \
2919    |   [ ]_|   <-- old scalar loop to handle remainder.
2920     \   |
2921      \  v
2922       >[ ]     <-- exit block.
2923    ...
2924    */
2925 
2926   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2927   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2928   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2929   MDNode *OrigLoopID = OrigLoop->getLoopID();
2930   assert(VectorPH && "Invalid loop structure");
2931   assert(ExitBlock && "Must have an exit block");
2932 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often use multiple pointer
  // induction variables. The code below also supports the case where we don't
  // have a single induction variable.
2937   //
2938   // We try to obtain an induction variable from the original loop as hard
2939   // as possible. However if we don't find one that:
2940   //   - is an integer
2941   //   - counts from zero, stepping by one
2942   //   - is the size of the widest induction variable type
2943   // then we create a new one.
2944   OldInduction = Legal->getPrimaryInduction();
2945   Type *IdxTy = Legal->getWidestInductionType();
2946 
2947   // Split the single block loop into the two loop structure described above.
2948   BasicBlock *VecBody =
2949       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2950   BasicBlock *MiddleBlock =
2951       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2952   BasicBlock *ScalarPH =
2953       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2954 
2955   // Create and register the new vector loop.
2956   Loop *Lp = LI->AllocateLoop();
2957   Loop *ParentLoop = OrigLoop->getParentLoop();
2958 
2959   // Insert the new loop into the loop nest and register the new basic blocks
2960   // before calling any utilities such as SCEV that require valid LoopInfo.
2961   if (ParentLoop) {
2962     ParentLoop->addChildLoop(Lp);
2963     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2964     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2965   } else {
2966     LI->addTopLevelLoop(Lp);
2967   }
2968   Lp->addBasicBlockToLoop(VecBody, *LI);
2969 
2970   // Find the loop boundaries.
2971   Value *Count = getOrCreateTripCount(Lp);
2972 
2973   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2974 
2975   // Now, compare the new count to zero. If it is zero skip the vector loop and
2976   // jump to the scalar loop. This check also covers the case where the
2977   // backedge-taken count is uint##_max: adding one to it will overflow leading
2978   // to an incorrect trip count of zero. In this (rare) case we will also jump
2979   // to the scalar loop.
2980   emitMinimumIterationCountCheck(Lp, ScalarPH);
2981 
2982   // Generate the code to check any assumptions that we've made for SCEV
2983   // expressions.
2984   emitSCEVChecks(Lp, ScalarPH);
2985 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2989   emitMemRuntimeChecks(Lp, ScalarPH);
2990 
2991   // Generate the induction variable.
2992   // The loop step is equal to the vectorization factor (num of SIMD elements)
2993   // times the unroll factor (num of SIMD instructions).
2994   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2995   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2996   Induction =
2997       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2998                               getDebugLocFromInstOrOperands(OldInduction));
2999 
3000   // We are going to resume the execution of the scalar loop.
3001   // Go over all of the induction variables that we found and fix the
3002   // PHIs that are left in the scalar version of the loop.
3003   // The starting values of PHI nodes depend on the counter of the last
3004   // iteration in the vectorized loop.
3005   // If we come from a bypass edge then we need to start from the original
3006   // start value.
3007 
3008   // This variable saves the new starting index for the scalar loop. It is used
3009   // to test if there are any tail iterations left once the vector loop has
3010   // completed.
3011   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3012   for (auto &InductionEntry : *List) {
3013     PHINode *OrigPhi = InductionEntry.first;
3014     InductionDescriptor II = InductionEntry.second;
3015 
    // Create phi nodes to merge from the backedge-taken check block.
3017     PHINode *BCResumeVal = PHINode::Create(
3018         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3019     // Copy original phi DL over to the new one.
3020     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3021     Value *&EndValue = IVEndValues[OrigPhi];
3022     if (OrigPhi == OldInduction) {
3023       // We know what the end value is.
3024       EndValue = CountRoundDown;
3025     } else {
3026       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3027       Type *StepType = II.getStep()->getType();
3028       Instruction::CastOps CastOp =
3029         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3030       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3031       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3032       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3033       EndValue->setName("ind.end");
3034     }
3035 
3036     // The new PHI merges the original incoming value, in case of a bypass,
3037     // or the value at the end of the vectorized loop.
3038     BCResumeVal->addIncoming(EndValue, MiddleBlock);
3039 
3040     // Fix the scalar body counter (PHI node).
3041     // The old induction's phi node in the scalar body needs the truncated
3042     // value.
3043     for (BasicBlock *BB : LoopBypassBlocks)
3044       BCResumeVal->addIncoming(II.getStartValue(), BB);
3045     OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
3046   }
3047 
3048   // We need the OrigLoop (scalar loop part) latch terminator to help
3049   // produce correct debug info for the middle block BB instructions.
3050   // The legality check stage guarantees that the loop will have a single
3051   // latch.
3052   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3053          "Scalar loop latch terminator isn't a branch");
3054   BranchInst *ScalarLatchBr =
3055       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3056 
3057   // Add a check in the middle block to see if we have completed
3058   // all of the iterations in the first vector loop.
3059   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3060   // If tail is to be folded, we know we don't need to run the remainder.
3061   Value *CmpN = Builder.getTrue();
3062   if (!Cost->foldTailByMasking()) {
3063     CmpN =
3064         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3065                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3066 
3067     // Here we use the same DebugLoc as the scalar loop latch branch instead
3068     // of the corresponding compare because they may have ended up with
3069     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3071     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3072   }
3073 
3074   BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3075   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3076   ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3077 
3078   // Get ready to start creating new instructions into the vectorized body.
3079   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3080 
3081   // Save the state.
3082   LoopVectorPreHeader = Lp->getLoopPreheader();
3083   LoopScalarPreHeader = ScalarPH;
3084   LoopMiddleBlock = MiddleBlock;
3085   LoopExitBlock = ExitBlock;
3086   LoopVectorBody = VecBody;
3087   LoopScalarBody = OldBasicBlock;
3088 
3089   Optional<MDNode *> VectorizedLoopID =
3090       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3091                                       LLVMLoopVectorizeFollowupVectorized});
3092   if (VectorizedLoopID.hasValue()) {
3093     Lp->setLoopID(VectorizedLoopID.getValue());
3094 
3095     // Do not setAlreadyVectorized if loop attributes have been defined
3096     // explicitly.
3097     return LoopVectorPreHeader;
3098   }
3099 
3100   // Keep all loop hints from the original loop on the vector loop (we'll
3101   // replace the vectorizer-specific hints below).
3102   if (MDNode *LID = OrigLoop->getLoopID())
3103     Lp->setLoopID(LID);
3104 
3105   LoopVectorizeHints Hints(Lp, true, *ORE);
3106   Hints.setAlreadyVectorized();
3107 
3108   return LoopVectorPreHeader;
3109 }
3110 
3111 // Fix up external users of the induction variable. At this point, we are
3112 // in LCSSA form, with all external PHIs that use the IV having one input value,
3113 // coming from the remainder loop. We need those PHIs to also have a correct
3114 // value for the IV when arriving directly from the middle block.
3115 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3116                                        const InductionDescriptor &II,
3117                                        Value *CountRoundDown, Value *EndValue,
3118                                        BasicBlock *MiddleBlock) {
3119   // There are two kinds of external IV usages - those that use the value
3120   // computed in the last iteration (the PHI) and those that use the penultimate
3121   // value (the value that feeds into the phi from the loop latch).
3122   // We allow both, but they, obviously, have different values.
3123 
3124   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3125 
3126   DenseMap<Value *, Value *> MissingVals;
3127 
3128   // An external user of the last iteration's value should see the value that
3129   // the remainder loop uses to initialize its own IV.
3130   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3131   for (User *U : PostInc->users()) {
3132     Instruction *UI = cast<Instruction>(U);
3133     if (!OrigLoop->contains(UI)) {
3134       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3135       MissingVals[UI] = EndValue;
3136     }
3137   }
3138 
  // An external user of the penultimate value needs to see EndValue - Step.
3140   // The simplest way to get this is to recompute it from the constituent SCEVs,
3141   // that is Start + (Step * (CRD - 1)).
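  // E.g. for an induction starting at 0 with step 1 and a vector trip count of
  // 8, the escaping user sees 0 + 1 * (8 - 1) = 7.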
3142   for (User *U : OrigPhi->users()) {
3143     auto *UI = cast<Instruction>(U);
3144     if (!OrigLoop->contains(UI)) {
3145       const DataLayout &DL =
3146           OrigLoop->getHeader()->getModule()->getDataLayout();
3147       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3148 
3149       IRBuilder<> B(MiddleBlock->getTerminator());
3150       Value *CountMinusOne = B.CreateSub(
3151           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3152       Value *CMO =
3153           !II.getStep()->getType()->isIntegerTy()
3154               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3155                              II.getStep()->getType())
3156               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3157       CMO->setName("cast.cmo");
3158       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3159       Escape->setName("ind.escape");
3160       MissingVals[UI] = Escape;
3161     }
3162   }
3163 
3164   for (auto &I : MissingVals) {
3165     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3167     // that is %IV2 = phi [...], [ %IV1, %latch ]
3168     // In this case, if IV1 has an external use, we need to avoid adding both
3169     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3170     // don't already have an incoming value for the middle block.
3171     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3172       PHI->addIncoming(I.second, MiddleBlock);
3173   }
3174 }
3175 
3176 namespace {
3177 
3178 struct CSEDenseMapInfo {
3179   static bool canHandle(const Instruction *I) {
3180     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3181            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3182   }
3183 
3184   static inline Instruction *getEmptyKey() {
3185     return DenseMapInfo<Instruction *>::getEmptyKey();
3186   }
3187 
3188   static inline Instruction *getTombstoneKey() {
3189     return DenseMapInfo<Instruction *>::getTombstoneKey();
3190   }
3191 
3192   static unsigned getHashValue(const Instruction *I) {
3193     assert(canHandle(I) && "Unknown instruction!");
3194     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3195                                                            I->value_op_end()));
3196   }
3197 
3198   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3199     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3200         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3201       return LHS == RHS;
3202     return LHS->isIdenticalTo(RHS);
3203   }
3204 };
3205 
3206 } // end anonymous namespace
3207 
/// Perform CSE of induction variable instructions.
3209 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3211   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3212   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3213     Instruction *In = &*I++;
3214 
3215     if (!CSEDenseMapInfo::canHandle(In))
3216       continue;
3217 
3218     // Check if we can replace this instruction with any of the
3219     // visited instructions.
3220     if (Instruction *V = CSEMap.lookup(In)) {
3221       In->replaceAllUsesWith(V);
3222       In->eraseFromParent();
3223       continue;
3224     }
3225 
3226     CSEMap[In] = In;
3227   }
3228 }
3229 
3230 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3231                                                        unsigned VF,
3232                                                        bool &NeedToScalarize) {
3233   Function *F = CI->getCalledFunction();
3234   StringRef FnName = CI->getCalledFunction()->getName();
3235   Type *ScalarRetTy = CI->getType();
3236   SmallVector<Type *, 4> Tys, ScalarTys;
3237   for (auto &ArgOp : CI->arg_operands())
3238     ScalarTys.push_back(ArgOp->getType());
3239 
3240   // Estimate cost of scalarized vector call. The source operands are assumed
3241   // to be vectors, so we need to extract individual elements from there,
3242   // execute VF scalar calls, and then gather the result into the vector return
3243   // value.
3244   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3245   if (VF == 1)
3246     return ScalarCallCost;
3247 
3248   // Compute corresponding vector type for return value and arguments.
3249   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3250   for (Type *ScalarTy : ScalarTys)
3251     Tys.push_back(ToVectorTy(ScalarTy, VF));
3252 
3253   // Compute costs of unpacking argument values for the scalar calls and
3254   // packing the return values to a vector.
3255   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3256 
3257   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
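  // E.g. for VF = 4, a scalar call cost of 10, and a scalarization overhead of
  // 6, the scalarized cost is 4 * 10 + 6 = 46; it is compared against the cost
  // of a real vector call below, if one is available.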
3258 
3259   // If we can't emit a vector call for this function, then the currently found
3260   // cost is the cost we need to return.
3261   NeedToScalarize = true;
3262   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3263     return Cost;
3264 
3265   // If the corresponding vector cost is cheaper, return its cost.
3266   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3267   if (VectorCallCost < Cost) {
3268     NeedToScalarize = false;
3269     return VectorCallCost;
3270   }
3271   return Cost;
3272 }
3273 
3274 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3275                                                             unsigned VF) {
3276   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3277   assert(ID && "Expected intrinsic call!");
3278 
3279   FastMathFlags FMF;
3280   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3281     FMF = FPMO->getFastMathFlags();
3282 
3283   SmallVector<Value *, 4> Operands(CI->arg_operands());
3284   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3285 }
3286 
3287 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3288   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3289   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3290   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3291 }
3292 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3293   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3294   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3295   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3296 }
3297 
3298 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3299   // For every instruction `I` in MinBWs, truncate the operands, create a
3300   // truncated version of `I` and reextend its result. InstCombine runs
3301   // later and will remove any ext/trunc pairs.
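  // E.g. a <4 x i32> add whose result only needs 8 bits has its operands
  // truncated to <4 x i8>, is re-created as a <4 x i8> add, and the result is
  // zero-extended back to <4 x i32>.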
3302   SmallPtrSet<Value *, 4> Erased;
3303   for (const auto &KV : Cost->getMinimalBitwidths()) {
3304     // If the value wasn't vectorized, we must maintain the original scalar
3305     // type. The absence of the value from VectorLoopValueMap indicates that it
3306     // wasn't vectorized.
3307     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3308       continue;
3309     for (unsigned Part = 0; Part < UF; ++Part) {
3310       Value *I = getOrCreateVectorValue(KV.first, Part);
3311       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3312           !isa<Instruction>(I))
3313         continue;
3314       Type *OriginalTy = I->getType();
3315       Type *ScalarTruncatedTy =
3316           IntegerType::get(OriginalTy->getContext(), KV.second);
3317       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3318                                           OriginalTy->getVectorNumElements());
3319       if (TruncatedTy == OriginalTy)
3320         continue;
3321 
3322       IRBuilder<> B(cast<Instruction>(I));
3323       auto ShrinkOperand = [&](Value *V) -> Value * {
3324         if (auto *ZI = dyn_cast<ZExtInst>(V))
3325           if (ZI->getSrcTy() == TruncatedTy)
3326             return ZI->getOperand(0);
3327         return B.CreateZExtOrTrunc(V, TruncatedTy);
3328       };
3329 
3330       // The actual instruction modification depends on the instruction type,
3331       // unfortunately.
3332       Value *NewI = nullptr;
3333       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3334         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3335                              ShrinkOperand(BO->getOperand(1)));
3336 
3337         // Any wrapping introduced by shrinking this operation shouldn't be
3338         // considered undefined behavior. So, we can't unconditionally copy
3339         // arithmetic wrapping flags to NewI.
3340         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3341       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3342         NewI =
3343             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3344                          ShrinkOperand(CI->getOperand(1)));
3345       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3346         NewI = B.CreateSelect(SI->getCondition(),
3347                               ShrinkOperand(SI->getTrueValue()),
3348                               ShrinkOperand(SI->getFalseValue()));
3349       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3350         switch (CI->getOpcode()) {
3351         default:
3352           llvm_unreachable("Unhandled cast!");
3353         case Instruction::Trunc:
3354           NewI = ShrinkOperand(CI->getOperand(0));
3355           break;
3356         case Instruction::SExt:
3357           NewI = B.CreateSExtOrTrunc(
3358               CI->getOperand(0),
3359               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3360           break;
3361         case Instruction::ZExt:
3362           NewI = B.CreateZExtOrTrunc(
3363               CI->getOperand(0),
3364               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3365           break;
3366         }
3367       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3368         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3369         auto *O0 = B.CreateZExtOrTrunc(
3370             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3371         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3372         auto *O1 = B.CreateZExtOrTrunc(
3373             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3374 
3375         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3376       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3377         // Don't do anything with the operands, just extend the result.
3378         continue;
3379       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3380         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3381         auto *O0 = B.CreateZExtOrTrunc(
3382             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3383         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3384         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3385       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3386         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3387         auto *O0 = B.CreateZExtOrTrunc(
3388             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3389         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3390       } else {
3391         // If we don't know what to do, be conservative and don't do anything.
3392         continue;
3393       }
3394 
3395       // Lastly, extend the result.
3396       NewI->takeName(cast<Instruction>(I));
3397       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3398       I->replaceAllUsesWith(Res);
3399       cast<Instruction>(I)->eraseFromParent();
3400       Erased.insert(I);
3401       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3402     }
3403   }
3404 
3405   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3406   for (const auto &KV : Cost->getMinimalBitwidths()) {
3407     // If the value wasn't vectorized, we must maintain the original scalar
3408     // type. The absence of the value from VectorLoopValueMap indicates that it
3409     // wasn't vectorized.
3410     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3411       continue;
3412     for (unsigned Part = 0; Part < UF; ++Part) {
3413       Value *I = getOrCreateVectorValue(KV.first, Part);
3414       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3415       if (Inst && Inst->use_empty()) {
3416         Value *NewI = Inst->getOperand(0);
3417         Inst->eraseFromParent();
3418         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3419       }
3420     }
3421   }
3422 }
3423 
3424 void InnerLoopVectorizer::fixVectorizedLoop() {
3425   // Insert truncates and extends for any truncated instructions as hints to
3426   // InstCombine.
3427   if (VF > 1)
3428     truncateToMinimalBitwidths();
3429 
3430   // Fix widened non-induction PHIs by setting up the PHI operands.
3431   if (OrigPHIsToFix.size()) {
3432     assert(EnableVPlanNativePath &&
3433            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3434     fixNonInductionPHIs();
3435   }
3436 
3437   // At this point every instruction in the original loop is widened to a
3438   // vector form. Now we need to fix the recurrences in the loop. These PHI
3439   // nodes are currently empty because we did not want to introduce cycles.
3440   // This is the second stage of vectorizing recurrences.
3441   fixCrossIterationPHIs();
3442 
3443   // Update the dominator tree.
3444   //
3445   // FIXME: After creating the structure of the new loop, the dominator tree is
3446   //        no longer up-to-date, and it remains that way until we update it
3447   //        here. An out-of-date dominator tree is problematic for SCEV,
3448   //        because SCEVExpander uses it to guide code generation. The
3449   //        vectorizer use SCEVExpanders in several places. Instead, we should
3450   //        keep the dominator tree up-to-date as we go.
3451   updateAnalysis();
3452 
3453   // Fix-up external users of the induction variables.
3454   for (auto &Entry : *Legal->getInductionVars())
3455     fixupIVUsers(Entry.first, Entry.second,
3456                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3457                  IVEndValues[Entry.first], LoopMiddleBlock);
3458 
3459   fixLCSSAPHIs();
3460   for (Instruction *PI : PredicatedInstructions)
3461     sinkScalarOperands(&*PI);
3462 
3463   // Remove redundant induction instructions.
3464   cse(LoopVectorBody);
3465 }
3466 
3467 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3468   // In order to support recurrences we need to be able to vectorize Phi nodes.
3469   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3470   // stage #2: We now need to fix the recurrences by adding incoming edges to
3471   // the currently empty PHI nodes. At this point every instruction in the
3472   // original loop is widened to a vector form so we can use them to construct
3473   // the incoming edges.
3474   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3475     // Handle first-order recurrences and reductions that need to be fixed.
3476     if (Legal->isFirstOrderRecurrence(&Phi))
3477       fixFirstOrderRecurrence(&Phi);
3478     else if (Legal->isReductionVariable(&Phi))
3479       fixReduction(&Phi);
3480   }
3481 }
3482 
3483 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3484   // This is the second phase of vectorizing first-order recurrences. An
3485   // overview of the transformation is described below. Suppose we have the
3486   // following loop.
3487   //
3488   //   for (int i = 0; i < n; ++i)
3489   //     b[i] = a[i] - a[i - 1];
3490   //
3491   // There is a first-order recurrence on "a". For this loop, the shorthand
3492   // scalar IR looks like:
3493   //
3494   //   scalar.ph:
3495   //     s_init = a[-1]
3496   //     br scalar.body
3497   //
3498   //   scalar.body:
3499   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3500   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3501   //     s2 = a[i]
3502   //     b[i] = s2 - s1
3503   //     br cond, scalar.body, ...
3504   //
  //   In this example, s1 is a recurrence because its value depends on the
3506   // previous iteration. In the first phase of vectorization, we created a
3507   // temporary value for s1. We now complete the vectorization and produce the
3508   // shorthand vector IR shown below (for VF = 4, UF = 1).
3509   //
3510   //   vector.ph:
3511   //     v_init = vector(..., ..., ..., a[-1])
3512   //     br vector.body
3513   //
3514   //   vector.body
3515   //     i = phi [0, vector.ph], [i+4, vector.body]
3516   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3517   //     v2 = a[i, i+1, i+2, i+3];
3518   //     v3 = vector(v1(3), v2(0, 1, 2))
3519   //     b[i, i+1, i+2, i+3] = v2 - v3
3520   //     br cond, vector.body, middle.block
3521   //
3522   //   middle.block:
3523   //     x = v2(3)
3524   //     br scalar.ph
3525   //
3526   //   scalar.ph:
3527   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3528   //     br scalar.body
3529   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3532 
3533   // Get the original loop preheader and single loop latch.
3534   auto *Preheader = OrigLoop->getLoopPreheader();
3535   auto *Latch = OrigLoop->getLoopLatch();
3536 
3537   // Get the initial and previous values of the scalar recurrence.
3538   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3539   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3540 
3541   // Create a vector from the initial value.
3542   auto *VectorInit = ScalarInit;
3543   if (VF > 1) {
3544     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3545     VectorInit = Builder.CreateInsertElement(
3546         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3547         Builder.getInt32(VF - 1), "vector.recur.init");
3548   }
3549 
3550   // We constructed a temporary phi node in the first phase of vectorization.
3551   // This phi node will eventually be deleted.
3552   Builder.SetInsertPoint(
3553       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3554 
3555   // Create a phi node for the new recurrence. The current value will either be
3556   // the initial value inserted into a vector or loop-varying vector value.
3557   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3558   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3559 
3560   // Get the vectorized previous value of the last part UF - 1. It appears last
3561   // among all unrolled iterations, due to the order of their construction.
3562   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3563 
3564   // Find and set the insertion point after the previous value if it is an
3565   // instruction.
3566   BasicBlock::iterator InsertPt;
3567   // Note that the previous value may have been constant-folded so it is not
3568   // guaranteed to be an instruction in the vector loop.
3569   // FIXME: Loop invariant values do not form recurrences. We should deal with
3570   //        them earlier.
3571   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3572     InsertPt = LoopVectorBody->getFirstInsertionPt();
3573   else {
3574     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3575     if (isa<PHINode>(PreviousLastPart))
3576       // If the previous value is a phi node, we should insert after all the phi
3577       // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different from
3579       // LoopVectorBody, in case we predicate the loop.
3580       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3581     else
3582       InsertPt = ++PreviousInst->getIterator();
3583   }
3584   Builder.SetInsertPoint(&*InsertPt);
3585 
3586   // We will construct a vector for the recurrence by combining the values for
3587   // the current and previous iterations. This is the required shuffle mask.
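  // E.g. for VF = 4 the mask is <3, 4, 5, 6>, selecting the last element of
  // 'Incoming' followed by the first three elements of the current part
  // (v1(3) and v2(0, 1, 2) in the example above).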
3588   SmallVector<Constant *, 8> ShuffleMask(VF);
3589   ShuffleMask[0] = Builder.getInt32(VF - 1);
3590   for (unsigned I = 1; I < VF; ++I)
3591     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3592 
3593   // The vector from which to take the initial value for the current iteration
3594   // (actual or unrolled). Initially, this is the vector phi node.
3595   Value *Incoming = VecPhi;
3596 
3597   // Shuffle the current and previous vector and update the vector parts.
3598   for (unsigned Part = 0; Part < UF; ++Part) {
3599     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3600     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3601     auto *Shuffle =
3602         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3603                                              ConstantVector::get(ShuffleMask))
3604                : Incoming;
3605     PhiPart->replaceAllUsesWith(Shuffle);
3606     cast<Instruction>(PhiPart)->eraseFromParent();
3607     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3608     Incoming = PreviousPart;
3609   }
3610 
3611   // Fix the latch value of the new recurrence in the vector loop.
3612   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3613 
3614   // Extract the last vector element in the middle block. This will be the
3615   // initial value for the recurrence when jumping to the scalar loop.
3616   auto *ExtractForScalar = Incoming;
3617   if (VF > 1) {
3618     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3619     ExtractForScalar = Builder.CreateExtractElement(
3620         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3621   }
  // Extract the second-to-last element in the middle block if the
3623   // Phi is used outside the loop. We need to extract the phi itself
3624   // and not the last element (the phi update in the current iteration). This
3625   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3626   // when the scalar loop is not run at all.
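  // E.g. for VF = 4 this extracts element 2 of the last unrolled part.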
3627   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3628   if (VF > 1)
3629     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3630         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
3635   else if (UF > 1)
3636     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3637 
3638   // Fix the initial value of the original recurrence in the scalar loop.
3639   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3640   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3641   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3642     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3643     Start->addIncoming(Incoming, BB);
3644   }
3645 
3646   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3647   Phi->setName("scalar.recur");
3648 
3649   // Finally, fix users of the recurrence outside the loop. The users will need
3650   // either the last value of the scalar recurrence or the last value of the
3651   // vector recurrence we extracted in the middle block. Since the loop is in
3652   // LCSSA form, we just need to find all the phi nodes for the original scalar
3653   // recurrence in the exit block, and then add an edge for the middle block.
3654   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3655     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3656       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3657     }
3658   }
3659 }
3660 
3661 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3662   Constant *Zero = Builder.getInt32(0);
3663 
3664   // Get its reduction variable descriptor.
3665   assert(Legal->isReductionVariable(Phi) &&
3666          "Unable to find the reduction variable");
3667   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3668 
3669   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3670   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3671   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3672   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3673     RdxDesc.getMinMaxRecurrenceKind();
3674   setDebugLocFromInst(Builder, ReductionStartValue);
3675 
3676   // We need to generate a reduction vector from the incoming scalar.
3677   // To do so, we need to generate the 'identity' vector and override
3678   // one of the elements with the incoming scalar reduction. We need
3679   // to do it in the vector-loop preheader.
3680   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3681 
3682   // This is the vector-clone of the value that leaves the loop.
3683   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3684 
3685   // Find the reduction identity value: zero for addition, or, and xor;
3686   // one for multiplication; -1 (all ones) for and.
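  // For example (illustrative): an integer add reduction with start value %s
  // and VF = 4 uses Identity = <0, 0, 0, 0> and VectorStart = <%s, 0, 0, 0>,
  // while a min/max reduction splats %s for both.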
3687   Value *Identity;
3688   Value *VectorStart;
3689   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3690       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3691     // MinMax reductions have the start value as their identity.
3692     if (VF == 1) {
3693       VectorStart = Identity = ReductionStartValue;
3694     } else {
3695       VectorStart = Identity =
3696         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3697     }
3698   } else {
3699     // Handle other reduction kinds:
3700     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3701         RK, VecTy->getScalarType());
3702     if (VF == 1) {
3703       Identity = Iden;
3704       // With VF == 1 there is no vector to build; the start value is just
3705       // the incoming scalar reduction.
3706       VectorStart = ReductionStartValue;
3707     } else {
3708       Identity = ConstantVector::getSplat(VF, Iden);
3709 
3710       // This vector is the Identity vector where the first element is the
3711       // incoming scalar reduction.
3712       VectorStart =
3713         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3714     }
3715   }
3716 
3717   // Wrap flags are in general invalid after vectorization, clear them.
3718   clearReductionWrapFlags(RdxDesc);
3719 
3720   // Fix the vector-loop phi.
3721 
3722   // Reductions do not have to start at zero. They can start with
3723   // any loop invariant values.
3724   BasicBlock *Latch = OrigLoop->getLoopLatch();
3725   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3726 
3727   for (unsigned Part = 0; Part < UF; ++Part) {
3728     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3729     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3730     // Make sure to add the reduction start value only to the first unroll
3731     // part, so that it contributes exactly once when the parts are combined.
3732     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3733     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3734     cast<PHINode>(VecRdxPhi)
3735       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3736   }
3737 
3738   // Before each round, move the insertion point right between
3739   // the PHIs and the values we are going to write.
3740   // This allows us to write both PHINodes and the extractelement
3741   // instructions.
3742   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3743 
3744   setDebugLocFromInst(Builder, LoopExitInst);
3745 
3746   // If the tail is folded by masking, the vector value leaving the loop should
3747   // be the Select choosing between the vectorized LoopExitInst and the
3748   // vectorized Phi, rather than the LoopExitInst itself.
3749   if (Cost->foldTailByMasking()) {
3750     for (unsigned Part = 0; Part < UF; ++Part) {
3751       Value *VecLoopExitInst =
3752           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3753       Value *Sel = nullptr;
3754       for (User *U : VecLoopExitInst->users()) {
3755         if (isa<SelectInst>(U)) {
3756           assert(!Sel && "Reduction exit feeding two selects");
3757           Sel = U;
3758         } else
3759           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3760       }
3761       assert(Sel && "Reduction exit feeds no select");
3762       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3763     }
3764   }
3765 
3766   // If the vector reduction can be performed in a smaller type, we truncate
3767   // then extend the loop exit value to enable InstCombine to evaluate the
3768   // entire expression in the smaller type.
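  // For example (illustrative): an add reduction over i8 data carried in an
  // i32 phi is truncated to <VF x i8> and immediately extended back, letting
  // InstCombine shrink the whole chain to i8; the final reduced value is
  // extended back to the phi type further below.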
3769   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3770     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3771     Builder.SetInsertPoint(
3772         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3773     VectorParts RdxParts(UF);
3774     for (unsigned Part = 0; Part < UF; ++Part) {
3775       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3776       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3777       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3778                                         : Builder.CreateZExt(Trunc, VecTy);
3779       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3780            UI != RdxParts[Part]->user_end();)
3781         if (*UI != Trunc) {
3782           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3783           RdxParts[Part] = Extnd;
3784         } else {
3785           ++UI;
3786         }
3787     }
3788     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3789     for (unsigned Part = 0; Part < UF; ++Part) {
3790       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3791       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3792     }
3793   }
3794 
3795   // Reduce all of the unrolled parts into a single vector.
3796   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3797   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
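  // For example (illustrative): with UF = 2 and an integer add reduction, the
  // loop below emits a single "bin.rdx" add that combines the two unrolled
  // parts.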
3798 
3799   // The middle block terminator has already been assigned a DebugLoc here (the
3800   // OrigLoop's single latch terminator). We want the whole middle block to
3801   // appear to execute on this line because: (a) it is all compiler generated,
3802   // (b) these instructions are always executed after evaluating the latch
3803   // conditional branch, and (c) other passes may add new predecessors which
3804   // terminate on this line. This is the easiest way to ensure we don't
3805   // accidentally cause an extra step back into the loop while debugging.
3806   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3807   for (unsigned Part = 1; Part < UF; ++Part) {
3808     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3809     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3810       // Floating point operations had to be 'fast' to enable the reduction.
3811       ReducedPartRdx = addFastMathFlag(
3812           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3813                               ReducedPartRdx, "bin.rdx"),
3814           RdxDesc.getFastMathFlags());
3815     else
3816       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3817                                       RdxPart);
3818   }
3819 
3820   if (VF > 1) {
3821     bool NoNaN = Legal->hasFunNoNaNAttr();
3822     ReducedPartRdx =
3823         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
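    // For example (illustrative): a <4 x i32> add reduction is turned into a
    // single horizontal reduction here, e.g. an experimental vector.reduce.add
    // intrinsic or a shuffle-based tree, depending on the target.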
3824     // If the reduction can be performed in a smaller type, we need to extend
3825     // the reduction to the wider type before we branch to the original loop.
3826     if (Phi->getType() != RdxDesc.getRecurrenceType())
3827       ReducedPartRdx =
3828         RdxDesc.isSigned()
3829         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3830         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3831   }
3832 
3833   // Create a phi node that merges control-flow from the backedge-taken check
3834   // block and the middle block.
3835   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3836                                         LoopScalarPreHeader->getTerminator());
3837   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3838     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3839   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3840 
3841   // Now, we need to fix the users of the reduction variable
3842   // inside and outside of the scalar remainder loop.
3843   // We know that the loop is in LCSSA form. We need to update the
3844   // PHI nodes in the exit blocks.
3845   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3846     // All PHINodes need to have a single entry edge, or two if
3847     // we already fixed them.
3848     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3849 
3850     // We found a reduction value exit-PHI. Update it with the
3851     // incoming bypass edge.
3852     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3853       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3854   } // end of the LCSSA phi scan.
3855 
3856   // Fix the scalar loop reduction variable with the incoming reduction sum
3857   // from the vector body and from the backedge value.
3858   int IncomingEdgeBlockIdx =
3859     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3860   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3861   // Pick the other block.
3862   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3863   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3864   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3865 }
3866 
3867 void InnerLoopVectorizer::clearReductionWrapFlags(
3868     RecurrenceDescriptor &RdxDesc) {
3869   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3870   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3871       RK != RecurrenceDescriptor::RK_IntegerMult)
3872     return;
3873 
3874   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3875   assert(LoopExitInstr && "null loop exit instruction");
3876   SmallVector<Instruction *, 8> Worklist;
3877   SmallPtrSet<Instruction *, 8> Visited;
3878   Worklist.push_back(LoopExitInstr);
3879   Visited.insert(LoopExitInstr);
3880 
3881   while (!Worklist.empty()) {
3882     Instruction *Cur = Worklist.pop_back_val();
3883     if (isa<OverflowingBinaryOperator>(Cur))
3884       for (unsigned Part = 0; Part < UF; ++Part) {
3885         Value *V = getOrCreateVectorValue(Cur, Part);
3886         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3887       }
3888 
3889     for (User *U : Cur->users()) {
3890       Instruction *UI = cast<Instruction>(U);
3891       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3892           Visited.insert(UI).second)
3893         Worklist.push_back(UI);
3894     }
3895   }
3896 }
3897 
3898 void InnerLoopVectorizer::fixLCSSAPHIs() {
3899   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3900     if (LCSSAPhi.getNumIncomingValues() == 1) {
3901       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3902       // Non-instruction incoming values have a single copy, so lane 0 suffices.
3903       unsigned LastLane = 0;
3904       if (isa<Instruction>(IncomingValue))
3905           LastLane = Cost->isUniformAfterVectorization(
3906                          cast<Instruction>(IncomingValue), VF)
3907                          ? 0
3908                          : VF - 1;
3909       // Can be a loop invariant incoming value or the last scalar value to be
3910       // extracted from the vectorized loop.
3911       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3912       Value *lastIncomingValue =
3913           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3914       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3915     }
3916   }
3917 }
3918 
3919 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3920   // The basic block and loop containing the predicated instruction.
3921   auto *PredBB = PredInst->getParent();
3922   auto *VectorLoop = LI->getLoopFor(PredBB);
3923 
3924   // Initialize a worklist with the operands of the predicated instruction.
3925   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3926 
3927   // Holds instructions that we need to analyze again. An instruction may be
3928   // reanalyzed if we don't yet know if we can sink it or not.
3929   SmallVector<Instruction *, 8> InstsToReanalyze;
3930 
3931   // Returns true if a given use occurs in the predicated block. Phi nodes use
3932   // their operands in their corresponding predecessor blocks.
3933   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3934     auto *I = cast<Instruction>(U.getUser());
3935     BasicBlock *BB = I->getParent();
3936     if (auto *Phi = dyn_cast<PHINode>(I))
3937       BB = Phi->getIncomingBlock(
3938           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3939     return BB == PredBB;
3940   };
3941 
3942   // Iteratively sink the scalarized operands of the predicated instruction
3943   // into the block we created for it. When an instruction is sunk, its
3944   // operands are then added to the worklist. The algorithm ends once a full
3945   // pass through the worklist sinks no instructions.
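  // For example (illustrative): if a scalarized, predicated divide was moved
  // into its own block, an address or index computation used only by that
  // divide can be sunk into the block as well, which may in turn allow its own
  // operands to be sunk on a later pass.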
3946   bool Changed;
3947   do {
3948     // Add the instructions that need to be reanalyzed to the worklist, and
3949     // reset the changed indicator.
3950     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3951     InstsToReanalyze.clear();
3952     Changed = false;
3953 
3954     while (!Worklist.empty()) {
3955       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3956 
3957       // We can't sink an instruction if it is a phi node, is already in the
3958       // predicated block, is not in the loop, or may have side effects.
3959       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3960           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3961         continue;
3962 
3963       // It's legal to sink the instruction if all its uses occur in the
3964       // predicated block. Otherwise, there's nothing to do yet, and we may
3965       // need to reanalyze the instruction.
3966       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3967         InstsToReanalyze.push_back(I);
3968         continue;
3969       }
3970 
3971       // Move the instruction to the beginning of the predicated block, and add
3972       // its operands to the worklist.
3973       I->moveBefore(&*PredBB->getFirstInsertionPt());
3974       Worklist.insert(I->op_begin(), I->op_end());
3975 
3976       // The sinking may have enabled other instructions to be sunk, so we will
3977       // need to iterate.
3978       Changed = true;
3979     }
3980   } while (Changed);
3981 }
3982 
3983 void InnerLoopVectorizer::fixNonInductionPHIs() {
3984   for (PHINode *OrigPhi : OrigPHIsToFix) {
3985     PHINode *NewPhi =
3986         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3987     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3988 
3989     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3990         predecessors(OrigPhi->getParent()));
3991     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3992         predecessors(NewPhi->getParent()));
3993     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3994            "Scalar and Vector BB should have the same number of predecessors");
3995 
3996     // The insertion point in Builder may be invalidated by the time we get
3997     // here. Force the Builder insertion point to something valid so that we do
3998     // not run into issues during insertion point restore in
3999     // getOrCreateVectorValue calls below.
4000     Builder.SetInsertPoint(NewPhi);
4001 
4002     // The predecessor order is preserved and we can rely on mapping between
4003     // scalar and vector block predecessors.
4004     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4005       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4006 
4007       // When looking up the new scalar/vector values to fix up, use incoming
4008       // values from original phi.
4009       Value *ScIncV =
4010           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4011 
4012       // Scalar incoming value may need a broadcast.
4013       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4014       NewPhi->addIncoming(NewIncV, NewPredBB);
4015     }
4016   }
4017 }
4018 
4019 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4020                                    unsigned VF, bool IsPtrLoopInvariant,
4021                                    SmallBitVector &IsIndexLoopInvariant) {
4022   // Construct a vector GEP by widening the operands of the scalar GEP as
4023   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4024   // results in a vector of pointers when at least one operand of the GEP
4025   // is vector-typed. Thus, to keep the representation compact, we only use
4026   // vector-typed operands for loop-varying values.
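  // For example (illustrative): a GEP with a loop-invariant base pointer and a
  // loop-varying index keeps the scalar base and receives a vector index,
  // yielding a vector of VF pointers.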
4027 
4028   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4029     // If we are vectorizing, but the GEP has only loop-invariant operands,
4030     // the GEP we build (by only using vector-typed operands for
4031     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4032     // produce a vector of pointers, we need to either arbitrarily pick an
4033     // operand to broadcast, or broadcast a clone of the original GEP.
4034     // Here, we broadcast a clone of the original.
4035     //
4036     // TODO: If at some point we decide to scalarize instructions having
4037     //       loop-invariant operands, this special case will no longer be
4038     //       required. We would add the scalarization decision to
4039     //       collectLoopScalars() and teach getVectorValue() to broadcast
4040     //       the lane-zero scalar value.
4041     auto *Clone = Builder.Insert(GEP->clone());
4042     for (unsigned Part = 0; Part < UF; ++Part) {
4043       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4044       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4045       addMetadata(EntryPart, GEP);
4046     }
4047   } else {
4048     // If the GEP has at least one loop-varying operand, we are sure to
4049     // produce a vector of pointers. But if we are only unrolling, we want
4050     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4051     // produce with the code below will be scalar (if VF == 1) or vector
4052     // (otherwise). Note that for the unroll-only case, we still maintain
4053     // values in the vector mapping with initVector, as we do for other
4054     // instructions.
4055     for (unsigned Part = 0; Part < UF; ++Part) {
4056       // The pointer operand of the new GEP. If it's loop-invariant, we
4057       // won't broadcast it.
4058       auto *Ptr = IsPtrLoopInvariant
4059                       ? GEP->getPointerOperand()
4060                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4061 
4062       // Collect all the indices for the new GEP. If any index is
4063       // loop-invariant, we won't broadcast it.
4064       SmallVector<Value *, 4> Indices;
4065       for (auto Index : enumerate(GEP->indices())) {
4066         Value *User = Index.value().get();
4067         if (IsIndexLoopInvariant[Index.index()])
4068           Indices.push_back(User);
4069         else
4070           Indices.push_back(getOrCreateVectorValue(User, Part));
4071       }
4072 
4073       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4074       // but it should be a vector, otherwise.
4075       auto *NewGEP =
4076           GEP->isInBounds()
4077               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4078                                           Indices)
4079               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4080       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4081              "NewGEP is not a pointer vector");
4082       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4083       addMetadata(NewGEP, GEP);
4084     }
4085   }
4086 }
4087 
4088 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4089                                               unsigned VF) {
4090   PHINode *P = cast<PHINode>(PN);
4091   if (EnableVPlanNativePath) {
4092     // Currently we enter here in the VPlan-native path for non-induction
4093     // PHIs where all control flow is uniform. We simply widen these PHIs.
4094     // Create a vector phi with no operands - the vector phi operands will be
4095     // set at the end of vector code generation.
4096     Type *VecTy =
4097         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4098     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4099     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4100     OrigPHIsToFix.push_back(P);
4101 
4102     return;
4103   }
4104 
4105   assert(PN->getParent() == OrigLoop->getHeader() &&
4106          "Non-header phis should have been handled elsewhere");
4107 
4108   // In order to support recurrences we need to be able to vectorize Phi nodes.
4109   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4110   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4111   // this value when we vectorize all of the instructions that use the PHI.
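  // For example (illustrative): a reduction phi such as
  //   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
  // first becomes an empty "vec.phi" of type <VF x i32>; fixReduction() later
  // fills in its preheader and latch incoming values.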
4112   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4113     for (unsigned Part = 0; Part < UF; ++Part) {
4114       // This is phase one of vectorizing PHIs.
4115       Type *VecTy =
4116           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4117       Value *EntryPart = PHINode::Create(
4118           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4119       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4120     }
4121     return;
4122   }
4123 
4124   setDebugLocFromInst(Builder, P);
4125 
4126   // This PHINode must be an induction variable.
4127   // Make sure that we know about it.
4128   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4129 
4130   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4131   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4132 
4133   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4134   // which can be found from the original scalar operations.
4135   switch (II.getKind()) {
4136   case InductionDescriptor::IK_NoInduction:
4137     llvm_unreachable("Unknown induction");
4138   case InductionDescriptor::IK_IntInduction:
4139   case InductionDescriptor::IK_FpInduction:
4140     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4141   case InductionDescriptor::IK_PtrInduction: {
4142     // Handle the pointer induction variable case.
4143     assert(P->getType()->isPointerTy() && "Unexpected type.");
4144     // This is the normalized GEP that starts counting at zero.
4145     Value *PtrInd = Induction;
4146     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4147     // Determine the number of scalars we need to generate for each unroll
4148     // iteration. If the instruction is uniform, we only need to generate the
4149     // first lane. Otherwise, we generate all VF values.
4150     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4151     // These are the scalar results. Notice that we don't generate vector GEPs
4152     // because scalar GEPs result in better code.
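    // For example (illustrative): with UF = 2 and VF = 4, a non-uniform
    // pointer induction produces eight scalar "next.gep" values, one per
    // (part, lane) pair, at offsets 0 through 7 from the normalized induction.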
4153     for (unsigned Part = 0; Part < UF; ++Part) {
4154       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4155         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4156         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4157         Value *SclrGep =
4158             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4159         SclrGep->setName("next.gep");
4160         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4161       }
4162     }
4163     return;
4164   }
4165   }
4166 }
4167 
4168 /// A helper function for checking whether an integer division-related
4169 /// instruction may divide by zero (in which case it must be predicated if
4170 /// executed conditionally in the scalar code).
4171 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4172 /// Non-zero divisors that are not compile-time constants will not be
4173 /// converted into multiplication, so we will still end up scalarizing
4174 /// the division, but can do so without predication.
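/// For example (illustrative): a conditional "a[i] / b[i]" must be predicated
/// because b[i] may be zero, whereas a conditional "a[i] / 7" can be
/// scalarized without predication.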
4175 static bool mayDivideByZero(Instruction &I) {
4176   assert((I.getOpcode() == Instruction::UDiv ||
4177           I.getOpcode() == Instruction::SDiv ||
4178           I.getOpcode() == Instruction::URem ||
4179           I.getOpcode() == Instruction::SRem) &&
4180          "Unexpected instruction");
4181   Value *Divisor = I.getOperand(1);
4182   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4183   return !CInt || CInt->isZero();
4184 }
4185 
4186 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4187   switch (I.getOpcode()) {
4188   case Instruction::Br:
4189   case Instruction::PHI:
4190   case Instruction::GetElementPtr:
4191     llvm_unreachable("This instruction is handled by a different recipe.");
4192   case Instruction::UDiv:
4193   case Instruction::SDiv:
4194   case Instruction::SRem:
4195   case Instruction::URem:
4196   case Instruction::Add:
4197   case Instruction::FAdd:
4198   case Instruction::Sub:
4199   case Instruction::FSub:
4200   case Instruction::FNeg:
4201   case Instruction::Mul:
4202   case Instruction::FMul:
4203   case Instruction::FDiv:
4204   case Instruction::FRem:
4205   case Instruction::Shl:
4206   case Instruction::LShr:
4207   case Instruction::AShr:
4208   case Instruction::And:
4209   case Instruction::Or:
4210   case Instruction::Xor: {
4211     // Just widen unops and binops.
4212     setDebugLocFromInst(Builder, &I);
4213 
4214     for (unsigned Part = 0; Part < UF; ++Part) {
4215       SmallVector<Value *, 2> Ops;
4216       for (Value *Op : I.operands())
4217         Ops.push_back(getOrCreateVectorValue(Op, Part));
4218 
4219       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4220 
4221       if (auto *VecOp = dyn_cast<Instruction>(V))
4222         VecOp->copyIRFlags(&I);
4223 
4224       // Use this vector value for all users of the original instruction.
4225       VectorLoopValueMap.setVectorValue(&I, Part, V);
4226       addMetadata(V, &I);
4227     }
4228 
4229     break;
4230   }
4231   case Instruction::Select: {
4232     // Widen selects.
4233     // If the selector is loop invariant we can create a select
4234     // instruction with a scalar condition. Otherwise, use vector-select.
4235     auto *SE = PSE.getSE();
4236     bool InvariantCond =
4237         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4238     setDebugLocFromInst(Builder, &I);
4239 
4240     // The condition can be loop invariant but still defined inside the
4241     // loop. This means that we can't just use the original 'cond' value.
4242     // We have to take the 'vectorized' value and pick the first lane.
4243     // Instcombine will make this a no-op.
4244 
4245     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4246 
4247     for (unsigned Part = 0; Part < UF; ++Part) {
4248       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4249       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4250       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4251       Value *Sel =
4252           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4253       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4254       addMetadata(Sel, &I);
4255     }
4256 
4257     break;
4258   }
4259 
4260   case Instruction::ICmp:
4261   case Instruction::FCmp: {
4262     // Widen compares. Generate vector compares.
4263     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4264     auto *Cmp = cast<CmpInst>(&I);
4265     setDebugLocFromInst(Builder, Cmp);
4266     for (unsigned Part = 0; Part < UF; ++Part) {
4267       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4268       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4269       Value *C = nullptr;
4270       if (FCmp) {
4271         // Propagate fast math flags.
4272         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4273         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4274         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4275       } else {
4276         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4277       }
4278       VectorLoopValueMap.setVectorValue(&I, Part, C);
4279       addMetadata(C, &I);
4280     }
4281 
4282     break;
4283   }
4284 
4285   case Instruction::ZExt:
4286   case Instruction::SExt:
4287   case Instruction::FPToUI:
4288   case Instruction::FPToSI:
4289   case Instruction::FPExt:
4290   case Instruction::PtrToInt:
4291   case Instruction::IntToPtr:
4292   case Instruction::SIToFP:
4293   case Instruction::UIToFP:
4294   case Instruction::Trunc:
4295   case Instruction::FPTrunc:
4296   case Instruction::BitCast: {
4297     auto *CI = cast<CastInst>(&I);
4298     setDebugLocFromInst(Builder, CI);
4299 
4300     // Vectorize casts.
4301     Type *DestTy =
4302         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4303 
4304     for (unsigned Part = 0; Part < UF; ++Part) {
4305       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4306       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4307       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4308       addMetadata(Cast, &I);
4309     }
4310     break;
4311   }
4312 
4313   case Instruction::Call: {
4314     // Ignore dbg intrinsics.
4315     if (isa<DbgInfoIntrinsic>(I))
4316       break;
4317     setDebugLocFromInst(Builder, &I);
4318 
4319     Module *M = I.getParent()->getParent()->getParent();
4320     auto *CI = cast<CallInst>(&I);
4321 
4322     StringRef FnName = CI->getCalledFunction()->getName();
4323     Function *F = CI->getCalledFunction();
4324     Type *RetTy = ToVectorTy(CI->getType(), VF);
4325     SmallVector<Type *, 4> Tys;
4326     for (Value *ArgOperand : CI->arg_operands())
4327       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4328 
4329     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4330 
4331     // The flag shows whether we use an Intrinsic or a usual Call for the
4332     // vectorized version of the instruction, i.e. whether it is more
4333     // beneficial to perform the intrinsic call than the lib call.
4334     bool NeedToScalarize;
4335     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4336     bool UseVectorIntrinsic =
4337         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4338     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4339            "Instruction should be scalarized elsewhere.");
4340 
4341     for (unsigned Part = 0; Part < UF; ++Part) {
4342       SmallVector<Value *, 4> Args;
4343       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4344         Value *Arg = CI->getArgOperand(i);
4345         // Some intrinsics have a scalar argument - don't replace it with a
4346         // vector.
4347         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4348           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4349         Args.push_back(Arg);
4350       }
4351 
4352       Function *VectorF;
4353       if (UseVectorIntrinsic) {
4354         // Use vector version of the intrinsic.
4355         Type *TysForDecl[] = {CI->getType()};
4356         if (VF > 1)
4357           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4358         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4359       } else {
4360         // Use vector version of the library call.
4361         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4362         assert(!VFnName.empty() && "Vector function name is empty.");
4363         VectorF = M->getFunction(VFnName);
4364         if (!VectorF) {
4365           // Generate a declaration
4366           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4367           VectorF =
4368               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4369           VectorF->copyAttributesFrom(F);
4370         }
4371       }
4372       assert(VectorF && "Can't create vector function.");
4373 
4374       SmallVector<OperandBundleDef, 1> OpBundles;
4375       CI->getOperandBundlesAsDefs(OpBundles);
4376       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4377 
4378       if (isa<FPMathOperator>(V))
4379         V->copyFastMathFlags(CI);
4380 
4381       VectorLoopValueMap.setVectorValue(&I, Part, V);
4382       addMetadata(V, &I);
4383     }
4384 
4385     break;
4386   }
4387 
4388   default:
4389     // This instruction is not vectorized by simple widening.
4390     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4391     llvm_unreachable("Unhandled instruction!");
4392   } // end of switch.
4393 }
4394 
4395 void InnerLoopVectorizer::updateAnalysis() {
4396   // Forget the original basic block.
4397   PSE.getSE()->forgetLoop(OrigLoop);
4398 
4399   // DT is not kept up-to-date for outer loop vectorization
4400   if (EnableVPlanNativePath)
4401     return;
4402 
4403   // Update the dominator tree information.
4404   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4405          "Entry does not dominate exit.");
4406 
4407   DT->addNewBlock(LoopMiddleBlock,
4408                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4409   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4410   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4411   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4412   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4413 }
4414 
4415 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4416   // We should not collect Scalars more than once per VF. Right now, this
4417   // function is called from collectUniformsAndScalars(), which already does
4418   // this check. Collecting Scalars for VF=1 does not make any sense.
4419   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4420          "This function should not be visited twice for the same VF");
4421 
4422   SmallSetVector<Instruction *, 8> Worklist;
4423 
4424   // These sets are used to seed the analysis with pointers used by memory
4425   // accesses that will remain scalar.
4426   SmallSetVector<Instruction *, 8> ScalarPtrs;
4427   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4428 
4429   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4430   // The pointer operands of loads and stores will be scalar as long as the
4431   // memory access is not a gather or scatter operation. The value operand of a
4432   // store will remain scalar if the store is scalarized.
4433   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4434     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4435     assert(WideningDecision != CM_Unknown &&
4436            "Widening decision should be ready at this moment");
4437     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4438       if (Ptr == Store->getValueOperand())
4439         return WideningDecision == CM_Scalarize;
4440     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4441            "Ptr is neither a value nor a pointer operand");
4442     return WideningDecision != CM_GatherScatter;
4443   };
4444 
4445   // A helper that returns true if the given value is a bitcast or
4446   // getelementptr instruction contained in the loop.
4447   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4448     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4449             isa<GetElementPtrInst>(V)) &&
4450            !TheLoop->isLoopInvariant(V);
4451   };
4452 
4453   // A helper that evaluates a memory access's use of a pointer. If the use
4454   // will be a scalar use, and the pointer is only used by memory accesses, we
4455   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4456   // PossibleNonScalarPtrs.
4457   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4458     // We only care about bitcast and getelementptr instructions contained in
4459     // the loop.
4460     if (!isLoopVaryingBitCastOrGEP(Ptr))
4461       return;
4462 
4463     // If the pointer has already been identified as scalar (e.g., if it was
4464     // also identified as uniform), there's nothing to do.
4465     auto *I = cast<Instruction>(Ptr);
4466     if (Worklist.count(I))
4467       return;
4468 
4469     // If the use of the pointer will be a scalar use, and all users of the
4470     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4471     // place the pointer in PossibleNonScalarPtrs.
4472     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4473           return isa<LoadInst>(U) || isa<StoreInst>(U);
4474         }))
4475       ScalarPtrs.insert(I);
4476     else
4477       PossibleNonScalarPtrs.insert(I);
4478   };
4479 
4480   // We seed the scalars analysis with three classes of instructions: (1)
4481   // instructions marked uniform-after-vectorization, (2) bitcast and
4482   // getelementptr instructions used by memory accesses requiring a scalar use,
4483   // and (3) pointer induction variables and their update instructions (we
4484   // currently only scalarize these).
4485   //
4486   // (1) Add to the worklist all instructions that have been identified as
4487   // uniform-after-vectorization.
4488   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4489 
4490   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4491   // memory accesses requiring a scalar use. The pointer operands of loads and
4492   // stores will be scalar as long as the memory access is not a gather or
4493   // scatter operation. The value operand of a store will remain scalar if the
4494   // store is scalarized.
4495   for (auto *BB : TheLoop->blocks())
4496     for (auto &I : *BB) {
4497       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4498         evaluatePtrUse(Load, Load->getPointerOperand());
4499       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4500         evaluatePtrUse(Store, Store->getPointerOperand());
4501         evaluatePtrUse(Store, Store->getValueOperand());
4502       }
4503     }
4504   for (auto *I : ScalarPtrs)
4505     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4506       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4507       Worklist.insert(I);
4508     }
4509 
4510   // (3) Add to the worklist all pointer induction variables and their update
4511   // instructions.
4512   //
4513   // TODO: Once we are able to vectorize pointer induction variables we should
4514   //       no longer insert them into the worklist here.
4515   auto *Latch = TheLoop->getLoopLatch();
4516   for (auto &Induction : *Legal->getInductionVars()) {
4517     auto *Ind = Induction.first;
4518     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4519     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4520       continue;
4521     Worklist.insert(Ind);
4522     Worklist.insert(IndUpdate);
4523     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4524     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4525                       << "\n");
4526   }
4527 
4528   // Insert the forced scalars.
4529   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4530   // induction variable when the PHI user is scalarized.
4531   auto ForcedScalar = ForcedScalars.find(VF);
4532   if (ForcedScalar != ForcedScalars.end())
4533     for (auto *I : ForcedScalar->second)
4534       Worklist.insert(I);
4535 
4536   // Expand the worklist by looking through any bitcasts and getelementptr
4537   // instructions we've already identified as scalar. This is similar to the
4538   // expansion step in collectLoopUniforms(); however, here we're only
4539   // expanding to include additional bitcasts and getelementptr instructions.
4540   unsigned Idx = 0;
4541   while (Idx != Worklist.size()) {
4542     Instruction *Dst = Worklist[Idx++];
4543     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4544       continue;
4545     auto *Src = cast<Instruction>(Dst->getOperand(0));
4546     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4547           auto *J = cast<Instruction>(U);
4548           return !TheLoop->contains(J) || Worklist.count(J) ||
4549                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4550                   isScalarUse(J, Src));
4551         })) {
4552       Worklist.insert(Src);
4553       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4554     }
4555   }
4556 
4557   // An induction variable will remain scalar if all users of the induction
4558   // variable and induction variable update remain scalar.
4559   for (auto &Induction : *Legal->getInductionVars()) {
4560     auto *Ind = Induction.first;
4561     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4562 
4563     // We already considered pointer induction variables, so there's no reason
4564     // to look at their users again.
4565     //
4566     // TODO: Once we are able to vectorize pointer induction variables we
4567     //       should no longer skip over them here.
4568     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4569       continue;
4570 
4571     // Determine if all users of the induction variable are scalar after
4572     // vectorization.
4573     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4574       auto *I = cast<Instruction>(U);
4575       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4576     });
4577     if (!ScalarInd)
4578       continue;
4579 
4580     // Determine if all users of the induction variable update instruction are
4581     // scalar after vectorization.
4582     auto ScalarIndUpdate =
4583         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4584           auto *I = cast<Instruction>(U);
4585           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4586         });
4587     if (!ScalarIndUpdate)
4588       continue;
4589 
4590     // The induction variable and its update instruction will remain scalar.
4591     Worklist.insert(Ind);
4592     Worklist.insert(IndUpdate);
4593     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4594     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4595                       << "\n");
4596   }
4597 
4598   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4599 }
4600 
4601 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4602   if (!blockNeedsPredication(I->getParent()))
4603     return false;
4604   switch(I->getOpcode()) {
4605   default:
4606     break;
4607   case Instruction::Load:
4608   case Instruction::Store: {
4609     if (!Legal->isMaskRequired(I))
4610       return false;
4611     auto *Ptr = getLoadStorePointerOperand(I);
4612     auto *Ty = getMemInstValueType(I);
4613     // We have already decided how to vectorize this instruction, get that
4614     // result.
4615     if (VF > 1) {
4616       InstWidening WideningDecision = getWideningDecision(I, VF);
4617       assert(WideningDecision != CM_Unknown &&
4618              "Widening decision should be ready at this moment");
4619       return WideningDecision == CM_Scalarize;
4620     }
4621     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4622     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4623                                 isLegalMaskedGather(Ty, Alignment))
4624                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4625                                 isLegalMaskedScatter(Ty, Alignment));
4626   }
4627   case Instruction::UDiv:
4628   case Instruction::SDiv:
4629   case Instruction::SRem:
4630   case Instruction::URem:
4631     return mayDivideByZero(*I);
4632   }
4633   return false;
4634 }
4635 
4636 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4637                                                                unsigned VF) {
4638   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4639   assert(getWideningDecision(I, VF) == CM_Unknown &&
4640          "Decision should not be set yet.");
4641   auto *Group = getInterleavedAccessGroup(I);
4642   assert(Group && "Must have a group.");
4643 
4644   // If the instruction's allocated size doesn't equal its type size, it
4645   // requires padding and will be scalarized.
4646   auto &DL = I->getModule()->getDataLayout();
4647   auto *ScalarTy = getMemInstValueType(I);
4648   if (hasIrregularType(ScalarTy, DL, VF))
4649     return false;
4650 
4651   // Check if masking is required.
4652   // A Group may need masking for one of two reasons: it resides in a block that
4653   // needs predication, or it was decided to use masking to deal with gaps.
4654   bool PredicatedAccessRequiresMasking =
4655       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4656   bool AccessWithGapsRequiresMasking =
4657       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4658   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4659     return true;
4660 
4661   // If masked interleaving is required, we expect that the user/target had
4662   // enabled it, because otherwise it either wouldn't have been created or
4663   // it should have been invalidated by the CostModel.
4664   assert(useMaskedInterleavedAccesses(TTI) &&
4665          "Masked interleave-groups for predicated accesses are not enabled.");
4666 
4667   auto *Ty = getMemInstValueType(I);
4668   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4669   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4670                           : TTI.isLegalMaskedStore(Ty, Alignment);
4671 }
4672 
4673 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4674                                                                unsigned VF) {
4675   // Get and ensure we have a valid memory instruction.
4676   LoadInst *LI = dyn_cast<LoadInst>(I);
4677   StoreInst *SI = dyn_cast<StoreInst>(I);
4678   assert((LI || SI) && "Invalid memory instruction");
4679 
4680   auto *Ptr = getLoadStorePointerOperand(I);
4681 
4682   // First of all, in order to be widened the pointer must be consecutive.
4683   if (!Legal->isConsecutivePtr(Ptr))
4684     return false;
4685 
4686   // If the instruction is a store located in a predicated block, it will be
4687   // scalarized.
4688   if (isScalarWithPredication(I))
4689     return false;
4690 
4691   // If the instruction's allocated size doesn't equal its type size, it
4692   // requires padding and will be scalarized.
4693   auto &DL = I->getModule()->getDataLayout();
4694   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4695   if (hasIrregularType(ScalarTy, DL, VF))
4696     return false;
4697 
4698   return true;
4699 }
4700 
4701 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4702   // We should not collect Uniforms more than once per VF. Right now,
4703   // this function is called from collectUniformsAndScalars(), which
4704   // already does this check. Collecting Uniforms for VF=1 does not make any
4705   // sense.
4706 
4707   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4708          "This function should not be visited twice for the same VF");
4709 
4710   // Create the entry for this VF up front: even if we find no uniform
4711   // values, Uniforms.count(VF) will return 1 and we won't analyze it again.
4712   Uniforms[VF].clear();
4713 
4714   // We now know that the loop is vectorizable!
4715   // Collect instructions inside the loop that will remain uniform after
4716   // vectorization.
4717 
4718   // Global values, params and instructions outside of the current loop are
4719   // out of scope.
4720   auto isOutOfScope = [&](Value *V) -> bool {
4721     Instruction *I = dyn_cast<Instruction>(V);
4722     return (!I || !TheLoop->contains(I));
4723   };
4724 
4725   SetVector<Instruction *> Worklist;
4726   BasicBlock *Latch = TheLoop->getLoopLatch();
4727 
4728   // Instructions that are scalar with predication must not be considered
4729   // uniform after vectorization, because that would create an erroneous
4730   // replicating region where only a single instance out of VF should be formed.
4731   // TODO: optimize such seldom cases if found important, see PR40816.
4732   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4733     if (isScalarWithPredication(I, VF)) {
4734       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4735                         << *I << "\n");
4736       return;
4737     }
4738     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4739     Worklist.insert(I);
4740   };
4741 
4742   // Start with the conditional branch. If the branch condition is an
4743   // instruction contained in the loop that is only used by the branch, it is
4744   // uniform.
4745   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4746   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4747     addToWorklistIfAllowed(Cmp);
4748 
4749   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4750   // are pointers that are treated like consecutive pointers during
4751   // vectorization. The pointer operands of interleaved accesses are an
4752   // example.
4753   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4754 
4755   // Holds pointer operands of instructions that are possibly non-uniform.
4756   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4757 
4758   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4759     InstWidening WideningDecision = getWideningDecision(I, VF);
4760     assert(WideningDecision != CM_Unknown &&
4761            "Widening decision should be ready at this moment");
4762 
4763     return (WideningDecision == CM_Widen ||
4764             WideningDecision == CM_Widen_Reverse ||
4765             WideningDecision == CM_Interleave);
4766   };
4767   // Iterate over the instructions in the loop, and collect all
4768   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4769   // that a consecutive-like pointer operand will be scalarized, we collect it
4770   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4771   // getelementptr instruction can be used by both vectorized and scalarized
4772   // memory instructions. For example, if a loop loads and stores from the same
4773   // location, but the store is conditional, the store will be scalarized, and
4774   // the getelementptr won't remain uniform.
4775   for (auto *BB : TheLoop->blocks())
4776     for (auto &I : *BB) {
4777       // If there's no pointer operand, there's nothing to do.
4778       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4779       if (!Ptr)
4780         continue;
4781 
4782       // True if all users of Ptr are memory accesses that have Ptr as their
4783       // pointer operand.
4784       auto UsersAreMemAccesses =
4785           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4786             return getLoadStorePointerOperand(U) == Ptr;
4787           });
4788 
4789       // Ensure the memory instruction will not be scalarized or used by
4790       // gather/scatter, making its pointer operand non-uniform. If the pointer
4791       // operand is used by any instruction other than a memory access, we
4792       // conservatively assume the pointer operand may be non-uniform.
4793       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4794         PossibleNonUniformPtrs.insert(Ptr);
4795 
4796       // If the memory instruction will be vectorized and its pointer operand
4797       // is consecutive-like, or interleaving - the pointer operand should
4798       // remain uniform.
4799       else
4800         ConsecutiveLikePtrs.insert(Ptr);
4801     }
4802 
4803   // Add to the Worklist all consecutive and consecutive-like pointers that
4804   // aren't also identified as possibly non-uniform.
4805   for (auto *V : ConsecutiveLikePtrs)
4806     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4807       addToWorklistIfAllowed(V);
4808 
4809   // Expand Worklist in topological order: whenever a new instruction
4810   // is added, its users should already be inside Worklist. This ensures
4811   // that a uniform instruction will only be used by uniform instructions.
4812   unsigned idx = 0;
4813   while (idx != Worklist.size()) {
4814     Instruction *I = Worklist[idx++];
4815 
4816     for (auto OV : I->operand_values()) {
4817       // isOutOfScope operands cannot be uniform instructions.
4818       if (isOutOfScope(OV))
4819         continue;
4820       // First order recurrence Phi's should typically be considered
4821       // non-uniform.
4822       auto *OP = dyn_cast<PHINode>(OV);
4823       if (OP && Legal->isFirstOrderRecurrence(OP))
4824         continue;
4825       // If all the users of the operand are uniform, then add the
4826       // operand into the uniform worklist.
4827       auto *OI = cast<Instruction>(OV);
4828       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4829             auto *J = cast<Instruction>(U);
4830             return Worklist.count(J) ||
4831                    (OI == getLoadStorePointerOperand(J) &&
4832                     isUniformDecision(J, VF));
4833           }))
4834         addToWorklistIfAllowed(OI);
4835     }
4836   }
4837 
4838   // Returns true if Ptr is the pointer operand of a memory access instruction
4839   // I, and I is known to not require scalarization.
4840   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4841     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4842   };
4843 
4844   // For an instruction to be added into Worklist above, all its users inside
4845   // the loop should also be in Worklist. However, this condition cannot be
4846   // true for phi nodes that form a cyclic dependence. We must process phi
4847   // nodes separately. An induction variable will remain uniform if all users
4848   // of the induction variable and induction variable update remain uniform.
4849   // The code below handles both pointer and non-pointer induction variables.
4850   for (auto &Induction : *Legal->getInductionVars()) {
4851     auto *Ind = Induction.first;
4852     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4853 
4854     // Determine if all users of the induction variable are uniform after
4855     // vectorization.
4856     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4857       auto *I = cast<Instruction>(U);
4858       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4859              isVectorizedMemAccessUse(I, Ind);
4860     });
4861     if (!UniformInd)
4862       continue;
4863 
4864     // Determine if all users of the induction variable update instruction are
4865     // uniform after vectorization.
4866     auto UniformIndUpdate =
4867         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4868           auto *I = cast<Instruction>(U);
4869           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4870                  isVectorizedMemAccessUse(I, IndUpdate);
4871         });
4872     if (!UniformIndUpdate)
4873       continue;
4874 
4875     // The induction variable and its update instruction will remain uniform.
4876     addToWorklistIfAllowed(Ind);
4877     addToWorklistIfAllowed(IndUpdate);
4878   }
4879 
4880   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4881 }
4882 
4883 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4884   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4885 
4886   if (Legal->getRuntimePointerChecking()->Need) {
4887     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4888         "runtime pointer checks needed. Enable vectorization of this "
4889         "loop with '#pragma clang loop vectorize(enable)' when "
4890         "compiling with -Os/-Oz",
4891         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4892     return true;
4893   }
4894 
4895   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4896     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4897         "runtime SCEV checks needed. Enable vectorization of this "
4898         "loop with '#pragma clang loop vectorize(enable)' when "
4899         "compiling with -Os/-Oz",
4900         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4901     return true;
4902   }
4903 
4904   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4905   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4906     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4907         "runtime stride == 1 checks needed. Enable vectorization of "
4908         "this loop with '#pragma clang loop vectorize(enable)' when "
4909         "compiling with -Os/-Oz",
4910         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4911     return true;
4912   }
4913 
4914   return false;
4915 }
4916 
4917 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4918   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the condition is still likely
    // to be dynamically uniform if the target can skip it.
4921     reportVectorizationFailure(
4922         "Not inserting runtime ptr check for divergent target",
4923         "runtime pointer checks needed. Not enabled for divergent target",
4924         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4925     return None;
4926   }
4927 
4928   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4929   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4930   if (TC == 1) {
4931     reportVectorizationFailure("Single iteration (non) loop",
4932         "loop trip count is one, irrelevant for vectorization",
4933         "SingleIterationLoop", ORE, TheLoop);
4934     return None;
4935   }
4936 
4937   switch (ScalarEpilogueStatus) {
4938   case CM_ScalarEpilogueAllowed:
4939     return computeFeasibleMaxVF(TC);
4940   case CM_ScalarEpilogueNotNeededUsePredicate:
4941     LLVM_DEBUG(
4942         dbgs() << "LV: vector predicate hint/switch found.\n"
4943                << "LV: Not allowing scalar epilogue, creating predicated "
4944                << "vector loop.\n");
4945     break;
4946   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4947     // fallthrough as a special case of OptForSize
4948   case CM_ScalarEpilogueNotAllowedOptSize:
4949     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4950       LLVM_DEBUG(
4951           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4952     else
4953       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4954                         << "count.\n");
4955 
4956     // Bail if runtime checks are required, which are not good when optimising
4957     // for size.
4958     if (runtimeChecksRequired())
4959       return None;
4960     break;
4961   }
4962 
  // Now try to fold the tail by masking.
4964 
4965   // Invalidate interleave groups that require an epilogue if we can't mask
4966   // the interleave-group.
4967   if (!useMaskedInterleavedAccesses(TTI))
4968     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4969 
4970   unsigned MaxVF = computeFeasibleMaxVF(TC);
4971   if (TC > 0 && TC % MaxVF == 0) {
4972     // Accept MaxVF if we do not have a tail.
4973     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4974     return MaxVF;
4975   }
4976 
4977   // If we don't know the precise trip count, or if the trip count that we
4978   // found modulo the vectorization factor is not zero, try to fold the tail
4979   // by masking.
4980   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4981   if (Legal->prepareToFoldTailByMasking()) {
4982     FoldTailByMasking = true;
4983     return MaxVF;
4984   }
4985 
4986   if (TC == 0) {
4987     reportVectorizationFailure(
4988         "Unable to calculate the loop count due to complex control flow",
4989         "unable to calculate the loop count due to complex control flow",
4990         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4991     return None;
4992   }
4993 
4994   reportVectorizationFailure(
4995       "Cannot optimize for size and vectorize at the same time.",
4996       "cannot optimize for size and vectorize at the same time. "
4997       "Enable vectorization of this loop with '#pragma clang loop "
4998       "vectorize(enable)' when compiling with -Os/-Oz",
4999       "NoTailLoopWithOptForSize", ORE, TheLoop);
5000   return None;
5001 }
5002 
5003 unsigned
5004 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5005   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5006   unsigned SmallestType, WidestType;
5007   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5008   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5009 
5010   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
5014   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5015 
5016   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5017 
5018   unsigned MaxVectorSize = WidestRegister / WidestType;
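  // For example (illustrative numbers): a 256-bit widest register and a
  // 32-bit widest type give MaxVectorSize = 256 / 32 = 8.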
5019 
5020   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5021                     << " / " << WidestType << " bits.\n");
5022   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5023                     << WidestRegister << " bits.\n");
5024 
5025   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5026                                  " into one vector!");
5027   if (MaxVectorSize == 0) {
5028     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5029     MaxVectorSize = 1;
5030     return MaxVectorSize;
5031   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5032              isPowerOf2_32(ConstTripCount)) {
5033     // We need to clamp the VF to be the ConstTripCount. There is no point in
5034     // choosing a higher viable VF as done in the loop below.
5035     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5036                       << ConstTripCount << "\n");
5037     MaxVectorSize = ConstTripCount;
5038     return MaxVectorSize;
5039   }
5040 
5041   unsigned MaxVF = MaxVectorSize;
5042   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5043       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5044     // Collect all viable vectorization factors larger than the default MaxVF
5045     // (i.e. MaxVectorSize).
5046     SmallVector<unsigned, 8> VFs;
5047     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5048     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5049       VFs.push_back(VS);
5050 
5051     // For each VF calculate its register usage.
5052     auto RUs = calculateRegisterUsage(VFs);
5053 
5054     // Select the largest VF which doesn't require more registers than existing
5055     // ones.
5056     for (int i = RUs.size() - 1; i >= 0; --i) {
5057       bool Selected = true;
5058       for (auto& pair : RUs[i].MaxLocalUsers) {
5059         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5060         if (pair.second > TargetNumRegisters)
5061           Selected = false;
5062       }
5063       if (Selected) {
5064         MaxVF = VFs[i];
5065         break;
5066       }
5067     }
5068     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5069       if (MaxVF < MinVF) {
5070         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5071                           << ") with target's minimum: " << MinVF << '\n');
5072         MaxVF = MinVF;
5073       }
5074     }
5075   }
5076   return MaxVF;
5077 }
5078 
5079 VectorizationFactor
5080 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5081   float Cost = expectedCost(1).first;
5082   const float ScalarCost = Cost;
5083   unsigned Width = 1;
5084   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5085 
5086   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5087   if (ForceVectorization && MaxVF > 1) {
5088     // Ignore scalar width, because the user explicitly wants vectorization.
5089     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5090     // evaluation.
5091     Cost = std::numeric_limits<float>::max();
5092   }
5093 
5094   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the vectorization
    // factor to get a cost that is comparable to the scalar loop cost.
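    // For example (illustrative numbers): if the scalar loop costs 8 and the
    // VF = 4 loop costs 20, the per-lane vector cost is 20 / 4 = 5, which
    // beats the scalar cost of 8.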
5098     VectorizationCostTy C = expectedCost(i);
5099     float VectorCost = C.first / (float)i;
5100     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5101                       << " costs: " << (int)VectorCost << ".\n");
5102     if (!C.second && !ForceVectorization) {
5103       LLVM_DEBUG(
5104           dbgs() << "LV: Not considering vector loop of width " << i
5105                  << " because it will not generate any vector instructions.\n");
5106       continue;
5107     }
5108     if (VectorCost < Cost) {
5109       Cost = VectorCost;
5110       Width = i;
5111     }
5112   }
5113 
5114   if (!EnableCondStoresVectorization && NumPredStores) {
5115     reportVectorizationFailure("There are conditional stores.",
5116         "store that is conditionally executed prevents vectorization",
5117         "ConditionalStore", ORE, TheLoop);
5118     Width = 1;
5119     Cost = ScalarCost;
5120   }
5121 
5122   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5123              << "LV: Vectorization seems to be not beneficial, "
5124              << "but was forced by a user.\n");
5125   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5126   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5127   return Factor;
5128 }
5129 
5130 std::pair<unsigned, unsigned>
5131 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5132   unsigned MinWidth = -1U;
5133   unsigned MaxWidth = 8;
5134   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5135 
5136   // For each block.
5137   for (BasicBlock *BB : TheLoop->blocks()) {
5138     // For each instruction in the loop.
5139     for (Instruction &I : BB->instructionsWithoutDebug()) {
5140       Type *T = I.getType();
5141 
5142       // Skip ignored values.
5143       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5144         continue;
5145 
5146       // Only examine Loads, Stores and PHINodes.
5147       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5148         continue;
5149 
5150       // Examine PHI nodes that are reduction variables. Update the type to
5151       // account for the recurrence type.
5152       if (auto *PN = dyn_cast<PHINode>(&I)) {
5153         if (!Legal->isReductionVariable(PN))
5154           continue;
5155         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5156         T = RdxDesc.getRecurrenceType();
5157       }
5158 
5159       // Examine the stored values.
5160       if (auto *ST = dyn_cast<StoreInst>(&I))
5161         T = ST->getValueOperand()->getType();
5162 
5163       // Ignore loaded pointer types and stored pointer types that are not
5164       // vectorizable.
5165       //
5166       // FIXME: The check here attempts to predict whether a load or store will
5167       //        be vectorized. We only know this for certain after a VF has
5168       //        been selected. Here, we assume that if an access can be
5169       //        vectorized, it will be. We should also look at extending this
5170       //        optimization to non-pointer types.
5171       //
5172       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5173           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5174         continue;
5175 
5176       MinWidth = std::min(MinWidth,
5177                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5178       MaxWidth = std::max(MaxWidth,
5179                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5180     }
5181   }
5182 
5183   return {MinWidth, MaxWidth};
5184 }
5185 
5186 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5187                                                            unsigned LoopCost) {
5188   // -- The interleave heuristics --
5189   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5190   // There are many micro-architectural considerations that we can't predict
5191   // at this level. For example, frontend pressure (on decode or fetch) due to
5192   // code size, or the number and capabilities of the execution ports.
5193   //
5194   // We use the following heuristics to select the interleave count:
5195   // 1. If the code has reductions, then we interleave to break the cross
5196   // iteration dependency.
5197   // 2. If the loop is really small, then we interleave to reduce the loop
5198   // overhead.
5199   // 3. We don't interleave if we think that we will spill registers to memory
5200   // due to the increased register pressure.
5201 
5202   if (!isScalarEpilogueAllowed())
5203     return 1;
5204 
  // The maximum safe dependence distance has already been used to limit the
  // vectorization factor; do not interleave on top of that.
5206   if (Legal->getMaxSafeDepDistBytes() != -1U)
5207     return 1;
5208 
5209   // Do not interleave loops with a relatively small known or estimated trip
5210   // count.
5211   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5212   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5213     return 1;
5214 
5215   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so clamp each of them to at least one,
  // i.e. assume at least one instruction uses at least one register.
5218   for (auto& pair : R.MaxLocalUsers) {
5219     pair.second = std::max(pair.second, 1U);
5220   }
5221 
5222   // We calculate the interleave count using the following formula.
5223   // Subtract the number of loop invariants from the number of available
5224   // registers. These registers are used by all of the interleaved instances.
5225   // Next, divide the remaining registers by the number of registers that is
5226   // required by the loop, in order to estimate how many parallel instances
5227   // fit without causing spills. All of this is rounded down if necessary to be
5228   // a power of two. We want power of two interleave count to simplify any
5229   // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case
  // we have already returned an interleave count of 1 above.
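  // Illustrative example (assumed numbers): with 32 registers in a class, 2 of
  // them used by loop-invariant values and a maximum of 6 simultaneously live
  // values, the basic formula gives PowerOf2Floor((32 - 2) / 6) =
  // PowerOf2Floor(5) = 4.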
5233   unsigned IC = UINT_MAX;
5234 
5235   for (auto& pair : R.MaxLocalUsers) {
5236     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5237     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5238                       << " registers of "
5239                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5240     if (VF == 1) {
5241       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5242         TargetNumRegisters = ForceTargetNumScalarRegs;
5243     } else {
5244       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5245         TargetNumRegisters = ForceTargetNumVectorRegs;
5246     }
5247     unsigned MaxLocalUsers = pair.second;
5248     unsigned LoopInvariantRegs = 0;
5249     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5250       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5251 
5252     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5253     // Don't count the induction variable as interleaved.
5254     if (EnableIndVarRegisterHeur) {
5255       TmpIC =
5256           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5257                         std::max(1U, (MaxLocalUsers - 1)));
5258     }
5259 
5260     IC = std::min(IC, TmpIC);
5261   }
5262 
5263   // Clamp the interleave ranges to reasonable counts.
5264   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5265 
5266   // Check if the user has overridden the max.
5267   if (VF == 1) {
5268     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5269       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5270   } else {
5271     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5272       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5273   }
5274 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF.
5277   if (BestKnownTC) {
5278     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5279   }
5280 
5281   // If we did not calculate the cost for VF (because the user selected the VF)
5282   // then we calculate the cost of VF here.
5283   if (LoopCost == 0)
5284     LoopCost = expectedCost(VF).first;
5285 
5286   assert(LoopCost && "Non-zero loop cost expected");
5287 
  // Clamp the calculated IC to be between 1 and the maximum interleave count
  // that the target and trip count allow.
5290   if (IC > MaxInterleaveCount)
5291     IC = MaxInterleaveCount;
5292   else if (IC < 1)
5293     IC = 1;
5294 
5295   // Interleave if we vectorized this loop and there is a reduction that could
5296   // benefit from interleaving.
5297   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5298     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5299     return IC;
5300   }
5301 
5302   // Note that if we've already vectorized the loop we will have done the
5303   // runtime check and so interleaving won't require further checks.
5304   bool InterleavingRequiresRuntimePointerCheck =
5305       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5306 
5307   // We want to interleave small loops in order to reduce the loop overhead and
5308   // potentially expose ILP opportunities.
5309   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5310   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5311     // We assume that the cost overhead is 1 and we use the cost model
5312     // to estimate the cost of the loop and interleave until the cost of the
5313     // loop overhead is about 5% of the cost of the loop.
5314     unsigned SmallIC =
5315         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5316 
5317     // Interleave until store/load ports (estimated by max interleave count) are
5318     // saturated.
5319     unsigned NumStores = Legal->getNumStores();
5320     unsigned NumLoads = Legal->getNumLoads();
5321     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5322     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
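    // For example (illustrative numbers): with IC = 8, 2 stores and 4 loads,
    // StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 4 = 2.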
5323 
5324     // If we have a scalar reduction (vector reductions are already dealt with
5325     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit this, by default, to 2
    // so that the critical path only gets increased by one reduction operation.
5328     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5329       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5330       SmallIC = std::min(SmallIC, F);
5331       StoresIC = std::min(StoresIC, F);
5332       LoadsIC = std::min(LoadsIC, F);
5333     }
5334 
5335     if (EnableLoadStoreRuntimeInterleave &&
5336         std::max(StoresIC, LoadsIC) > SmallIC) {
5337       LLVM_DEBUG(
5338           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5339       return std::max(StoresIC, LoadsIC);
5340     }
5341 
5342     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5343     return SmallIC;
5344   }
5345 
5346   // Interleave if this is a large loop (small loops are already dealt with by
5347   // this point) that could benefit from interleaving.
5348   bool HasReductions = !Legal->getReductionVars()->empty();
5349   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5350     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5351     return IC;
5352   }
5353 
5354   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5355   return 1;
5356 }
5357 
5358 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5359 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5360   // This function calculates the register usage by measuring the highest number
5361   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
5364   // met before their users. We assume that each instruction that has in-loop
5365   // users starts an interval. We record every time that an in-loop value is
5366   // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows a single linear scan. We scan the instructions linearly
5370   // and record each time that a new interval starts, by placing it in a set.
5371   // If we find this value in the multi-map then we remove it from the set.
5372   // The max register usage is the maximum size of the set.
5373   // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because, when we unroll, loop-invariant values do not take
  // up additional registers.
5377   LoopBlocksDFS DFS(TheLoop);
5378   DFS.perform(LI);
5379 
5380   RegisterUsage RU;
5381 
5382   // Each 'key' in the map opens a new interval. The values
5383   // of the map are the index of the 'last seen' usage of the
5384   // instruction that is the key.
5385   using IntervalMap = DenseMap<Instruction *, unsigned>;
5386 
5387   // Maps instruction to its index.
5388   SmallVector<Instruction *, 64> IdxToInstr;
5389   // Marks the end of each interval.
5390   IntervalMap EndPoint;
  // Saves the set of instructions that have uses inside the loop.
5392   SmallPtrSet<Instruction *, 8> Ends;
5393   // Saves the list of values that are used in the loop but are
5394   // defined outside the loop, such as arguments and constants.
5395   SmallPtrSet<Value *, 8> LoopInvariants;
5396 
5397   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5398     for (Instruction &I : BB->instructionsWithoutDebug()) {
5399       IdxToInstr.push_back(&I);
5400 
5401       // Save the end location of each USE.
5402       for (Value *U : I.operands()) {
5403         auto *Instr = dyn_cast<Instruction>(U);
5404 
5405         // Ignore non-instruction values such as arguments, constants, etc.
5406         if (!Instr)
5407           continue;
5408 
5409         // If this instruction is outside the loop then record it and continue.
5410         if (!TheLoop->contains(Instr)) {
5411           LoopInvariants.insert(Instr);
5412           continue;
5413         }
5414 
5415         // Overwrite previous end points.
5416         EndPoint[Instr] = IdxToInstr.size();
5417         Ends.insert(Instr);
5418       }
5419     }
5420   }
5421 
5422   // Saves the list of intervals that end with the index in 'key'.
5423   using InstrList = SmallVector<Instruction *, 2>;
5424   DenseMap<unsigned, InstrList> TransposeEnds;
5425 
5426   // Transpose the EndPoints to a list of values that end at each index.
5427   for (auto &Interval : EndPoint)
5428     TransposeEnds[Interval.second].push_back(Interval.first);
5429 
5430   SmallPtrSet<Instruction *, 8> OpenIntervals;
5431 
5432   // Get the size of the widest register.
5433   unsigned MaxSafeDepDist = -1U;
5434   if (Legal->getMaxSafeDepDistBytes() != -1U)
5435     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5436   unsigned WidestRegister =
5437       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5438   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5439 
5440   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5441   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5442 
5443   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5444 
5445   // A lambda that gets the register usage for the given type and VF.
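  // For example (assuming a 128-bit widest register), an i32 element at VF = 8
  // needs max(1, 8 * 32 / 128) = 2 registers.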
5446   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5447     if (Ty->isTokenTy())
5448       return 0U;
5449     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5450     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5451   };
5452 
5453   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5454     Instruction *I = IdxToInstr[i];
5455 
5456     // Remove all of the instructions that end at this location.
5457     InstrList &List = TransposeEnds[i];
5458     for (Instruction *ToRemove : List)
5459       OpenIntervals.erase(ToRemove);
5460 
5461     // Ignore instructions that are never used within the loop.
5462     if (Ends.find(I) == Ends.end())
5463       continue;
5464 
5465     // Skip ignored values.
5466     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5467       continue;
5468 
5469     // For each VF find the maximum usage of registers.
5470     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5471       // Count the number of live intervals.
5472       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5473 
5474       if (VFs[j] == 1) {
5475         for (auto Inst : OpenIntervals) {
5476           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5477           if (RegUsage.find(ClassID) == RegUsage.end())
5478             RegUsage[ClassID] = 1;
5479           else
5480             RegUsage[ClassID] += 1;
5481         }
5482       } else {
5483         collectUniformsAndScalars(VFs[j]);
5484         for (auto Inst : OpenIntervals) {
5485           // Skip ignored values for VF > 1.
5486           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5487             continue;
5488           if (isScalarAfterVectorization(Inst, VFs[j])) {
5489             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5490             if (RegUsage.find(ClassID) == RegUsage.end())
5491               RegUsage[ClassID] = 1;
5492             else
5493               RegUsage[ClassID] += 1;
5494           } else {
5495             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5496             if (RegUsage.find(ClassID) == RegUsage.end())
5497               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5498             else
5499               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5500           }
5501         }
5502       }
5503 
5504       for (auto& pair : RegUsage) {
5505         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5506           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5507         else
5508           MaxUsages[j][pair.first] = pair.second;
5509       }
5510     }
5511 
5512     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5513                       << OpenIntervals.size() << '\n');
5514 
5515     // Add the current instruction to the list of open intervals.
5516     OpenIntervals.insert(I);
5517   }
5518 
5519   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5520     SmallMapVector<unsigned, unsigned, 4> Invariant;
5521 
5522     for (auto Inst : LoopInvariants) {
5523       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5524       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5525       if (Invariant.find(ClassID) == Invariant.end())
5526         Invariant[ClassID] = Usage;
5527       else
5528         Invariant[ClassID] += Usage;
5529     }
5530 
5531     LLVM_DEBUG({
5532       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5533       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5534              << " item\n";
5535       for (const auto &pair : MaxUsages[i]) {
5536         dbgs() << "LV(REG): RegisterClass: "
5537                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5538                << " registers\n";
5539       }
5540       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5541              << " item\n";
5542       for (const auto &pair : Invariant) {
5543         dbgs() << "LV(REG): RegisterClass: "
5544                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5545                << " registers\n";
5546       }
5547     });
5548 
5549     RU.LoopInvariantRegs = Invariant;
5550     RU.MaxLocalUsers = MaxUsages[i];
5551     RUs[i] = RU;
5552   }
5553 
5554   return RUs;
5555 }
5556 
5557 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5558   // TODO: Cost model for emulated masked load/store is completely
5559   // broken. This hack guides the cost model to use an artificially
5560   // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked load/gather emulation was previously never allowed.
  // A limited amount of masked store/scatter emulation was allowed.
5566   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5567   return isa<LoadInst>(I) ||
5568          (isa<StoreInst>(I) &&
5569           NumPredStores > NumberOfStoresToPredicate);
5570 }
5571 
5572 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5573   // If we aren't vectorizing the loop, or if we've already collected the
5574   // instructions to scalarize, there's nothing to do. Collection may already
5575   // have occurred if we have a user-selected VF and are now computing the
5576   // expected cost for interleaving.
5577   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5578     return;
5579 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5581   // not profitable to scalarize any instructions, the presence of VF in the
5582   // map will indicate that we've analyzed it already.
5583   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5584 
5585   // Find all the instructions that are scalar with predication in the loop and
5586   // determine if it would be better to not if-convert the blocks they are in.
5587   // If so, we also record the instructions to scalarize.
5588   for (BasicBlock *BB : TheLoop->blocks()) {
5589     if (!blockNeedsPredication(BB))
5590       continue;
5591     for (Instruction &I : *BB)
5592       if (isScalarWithPredication(&I)) {
5593         ScalarCostsTy ScalarCosts;
        // Do not apply the discount logic if the artificially high cost is
        // needed for emulated masked memrefs.
5596         if (!useEmulatedMaskMemRefHack(&I) &&
5597             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5598           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5599         // Remember that BB will remain after vectorization.
5600         PredicatedBBsAfterVectorization.insert(BB);
5601       }
5602   }
5603 }
5604 
5605 int LoopVectorizationCostModel::computePredInstDiscount(
5606     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5607     unsigned VF) {
5608   assert(!isUniformAfterVectorization(PredInst, VF) &&
5609          "Instruction marked uniform-after-vectorization will be predicated");
5610 
5611   // Initialize the discount to zero, meaning that the scalar version and the
5612   // vector version cost the same.
5613   int Discount = 0;
5614 
5615   // Holds instructions to analyze. The instructions we visit are mapped in
5616   // ScalarCosts. Those instructions are the ones that would be scalarized if
5617   // we find that the scalar version costs less.
5618   SmallVector<Instruction *, 8> Worklist;
5619 
5620   // Returns true if the given instruction can be scalarized.
5621   auto canBeScalarized = [&](Instruction *I) -> bool {
5622     // We only attempt to scalarize instructions forming a single-use chain
5623     // from the original predicated block that would otherwise be vectorized.
5624     // Although not strictly necessary, we give up on instructions we know will
5625     // already be scalar to avoid traversing chains that are unlikely to be
5626     // beneficial.
5627     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5628         isScalarAfterVectorization(I, VF))
5629       return false;
5630 
5631     // If the instruction is scalar with predication, it will be analyzed
5632     // separately. We ignore it within the context of PredInst.
5633     if (isScalarWithPredication(I))
5634       return false;
5635 
5636     // If any of the instruction's operands are uniform after vectorization,
5637     // the instruction cannot be scalarized. This prevents, for example, a
5638     // masked load from being scalarized.
5639     //
5640     // We assume we will only emit a value for lane zero of an instruction
5641     // marked uniform after vectorization, rather than VF identical values.
5642     // Thus, if we scalarize an instruction that uses a uniform, we would
5643     // create uses of values corresponding to the lanes we aren't emitting code
5644     // for. This behavior can be changed by allowing getScalarValue to clone
5645     // the lane zero values for uniforms rather than asserting.
5646     for (Use &U : I->operands())
5647       if (auto *J = dyn_cast<Instruction>(U.get()))
5648         if (isUniformAfterVectorization(J, VF))
5649           return false;
5650 
5651     // Otherwise, we can scalarize the instruction.
5652     return true;
5653   };
5654 
5655   // Compute the expected cost discount from scalarizing the entire expression
5656   // feeding the predicated instruction. We currently only consider expressions
5657   // that are single-use instruction chains.
5658   Worklist.push_back(PredInst);
5659   while (!Worklist.empty()) {
5660     Instruction *I = Worklist.pop_back_val();
5661 
5662     // If we've already analyzed the instruction, there's nothing to do.
5663     if (ScalarCosts.find(I) != ScalarCosts.end())
5664       continue;
5665 
5666     // Compute the cost of the vector instruction. Note that this cost already
5667     // includes the scalarization overhead of the predicated instruction.
5668     unsigned VectorCost = getInstructionCost(I, VF).first;
5669 
5670     // Compute the cost of the scalarized instruction. This cost is the cost of
5671     // the instruction as if it wasn't if-converted and instead remained in the
5672     // predicated block. We will scale this cost by block probability after
5673     // computing the scalarization overhead.
5674     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5675 
5676     // Compute the scalarization overhead of needed insertelement instructions
5677     // and phi nodes.
5678     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5679       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5680                                                  true, false);
5681       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5682     }
5683 
5684     // Compute the scalarization overhead of needed extractelement
5685     // instructions. For each of the instruction's operands, if the operand can
5686     // be scalarized, add it to the worklist; otherwise, account for the
5687     // overhead.
5688     for (Use &U : I->operands())
5689       if (auto *J = dyn_cast<Instruction>(U.get())) {
5690         assert(VectorType::isValidElementType(J->getType()) &&
5691                "Instruction has non-scalar type");
5692         if (canBeScalarized(J))
5693           Worklist.push_back(J);
5694         else if (needsExtract(J, VF))
5695           ScalarCost += TTI.getScalarizationOverhead(
5696                               ToVectorTy(J->getType(),VF), false, true);
5697       }
5698 
5699     // Scale the total scalar cost by block probability.
5700     ScalarCost /= getReciprocalPredBlockProb();
5701 
5702     // Compute the discount. A non-negative discount means the vector version
5703     // of the instruction costs more, and scalarizing would be beneficial.
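    // Illustrative example: if VectorCost is 8 and the scaled ScalarCost is 5,
    // the discount grows by 3 in favor of scalarizing this chain.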
5704     Discount += VectorCost - ScalarCost;
5705     ScalarCosts[I] = ScalarCost;
5706   }
5707 
5708   return Discount;
5709 }
5710 
5711 LoopVectorizationCostModel::VectorizationCostTy
5712 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5713   VectorizationCostTy Cost;
5714 
5715   // For each block.
5716   for (BasicBlock *BB : TheLoop->blocks()) {
5717     VectorizationCostTy BlockCost;
5718 
5719     // For each instruction in the old loop.
5720     for (Instruction &I : BB->instructionsWithoutDebug()) {
5721       // Skip ignored values.
5722       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5723           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5724         continue;
5725 
5726       VectorizationCostTy C = getInstructionCost(&I, VF);
5727 
5728       // Check if we should override the cost.
5729       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5730         C.first = ForceTargetInstructionCost;
5731 
5732       BlockCost.first += C.first;
5733       BlockCost.second |= C.second;
5734       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5735                         << " for VF " << VF << " For instruction: " << I
5736                         << '\n');
5737     }
5738 
5739     // If we are vectorizing a predicated block, it will have been
5740     // if-converted. This means that the block's instructions (aside from
5741     // stores and instructions that may divide by zero) will now be
5742     // unconditionally executed. For the scalar case, we may not always execute
5743     // the predicated block. Thus, scale the block's cost by the probability of
5744     // executing it.
5745     if (VF == 1 && blockNeedsPredication(BB))
5746       BlockCost.first /= getReciprocalPredBlockProb();
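    // For example, if predicated blocks are assumed to execute half the time,
    // getReciprocalPredBlockProb() is 2 and the block cost above is halved.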
5747 
5748     Cost.first += BlockCost.first;
5749     Cost.second |= BlockCost.second;
5750   }
5751 
5752   return Cost;
5753 }
5754 
5755 /// Gets Address Access SCEV after verifying that the access pattern
5756 /// is loop invariant except the induction variable dependence.
5757 ///
5758 /// This SCEV can be sent to the Target in order to estimate the address
5759 /// calculation cost.
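///
/// For example (illustrative IR), a pointer such as
///   %gep = getelementptr inbounds [1024 x i32], [1024 x i32]* %A, i64 0, i64 %iv
/// qualifies because every GEP index is either loop invariant (the constant 0)
/// or an induction variable (%iv).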
5760 static const SCEV *getAddressAccessSCEV(
5761               Value *Ptr,
5762               LoopVectorizationLegality *Legal,
5763               PredicatedScalarEvolution &PSE,
5764               const Loop *TheLoop) {
5765 
5766   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5767   if (!Gep)
5768     return nullptr;
5769 
5770   // We are looking for a gep with all loop invariant indices except for one
5771   // which should be an induction variable.
5772   auto SE = PSE.getSE();
5773   unsigned NumOperands = Gep->getNumOperands();
5774   for (unsigned i = 1; i < NumOperands; ++i) {
5775     Value *Opd = Gep->getOperand(i);
5776     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5777         !Legal->isInductionVariable(Opd))
5778       return nullptr;
5779   }
5780 
  // Now we know we have a GEP of the form ptr, %inv, %ind, %inv. Return the
  // pointer's SCEV.
5782   return PSE.getSCEV(Ptr);
5783 }
5784 
5785 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5786   return Legal->hasStride(I->getOperand(0)) ||
5787          Legal->hasStride(I->getOperand(1));
5788 }
5789 
5790 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5791                                                                  unsigned VF) {
5792   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5793   Type *ValTy = getMemInstValueType(I);
5794   auto SE = PSE.getSE();
5795 
5796   unsigned AS = getLoadStoreAddressSpace(I);
5797   Value *Ptr = getLoadStorePointerOperand(I);
5798   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5799 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5802   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5803 
5804   // Get the cost of the scalar memory instruction and address computation.
5805   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5806 
5807   // Don't pass *I here, since it is scalar but will actually be part of a
5808   // vectorized loop where the user of it is a vectorized instruction.
5809   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5810   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5811                                    Alignment, AS);
5812 
5813   // Get the overhead of the extractelement and insertelement instructions
5814   // we might create due to scalarization.
5815   Cost += getScalarizationOverhead(I, VF);
5816 
5817   // If we have a predicated store, it may not be executed for each vector
5818   // lane. Scale the cost by the probability of executing the predicated
5819   // block.
5820   if (isPredicatedInst(I)) {
5821     Cost /= getReciprocalPredBlockProb();
5822 
5823     if (useEmulatedMaskMemRefHack(I))
5824       // Artificially setting to a high enough value to practically disable
5825       // vectorization with such operations.
5826       Cost = 3000000;
5827   }
5828 
5829   return Cost;
5830 }
5831 
5832 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5833                                                              unsigned VF) {
5834   Type *ValTy = getMemInstValueType(I);
5835   Type *VectorTy = ToVectorTy(ValTy, VF);
5836   Value *Ptr = getLoadStorePointerOperand(I);
5837   unsigned AS = getLoadStoreAddressSpace(I);
5838   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5839 
5840   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5841          "Stride should be 1 or -1 for consecutive memory access");
5842   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5843   unsigned Cost = 0;
5844   if (Legal->isMaskRequired(I))
5845     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5846                                       Alignment ? Alignment->value() : 0, AS);
5847   else
5848     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5849 
5850   bool Reverse = ConsecutiveStride < 0;
5851   if (Reverse)
5852     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5853   return Cost;
5854 }
5855 
5856 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5857                                                          unsigned VF) {
5858   Type *ValTy = getMemInstValueType(I);
5859   Type *VectorTy = ToVectorTy(ValTy, VF);
5860   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5861   unsigned AS = getLoadStoreAddressSpace(I);
5862   if (isa<LoadInst>(I)) {
5863     return TTI.getAddressComputationCost(ValTy) +
5864            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5865            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5866   }
5867   StoreInst *SI = cast<StoreInst>(I);
5868 
5869   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5870   return TTI.getAddressComputationCost(ValTy) +
5871          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5872          (isLoopInvariantStoreValue
5873               ? 0
5874               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5875                                        VF - 1));
5876 }
5877 
5878 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5879                                                           unsigned VF) {
5880   Type *ValTy = getMemInstValueType(I);
5881   Type *VectorTy = ToVectorTy(ValTy, VF);
5882   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5883   Value *Ptr = getLoadStorePointerOperand(I);
5884 
5885   return TTI.getAddressComputationCost(VectorTy) +
5886          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5887                                     Legal->isMaskRequired(I),
5888                                     Alignment ? Alignment->value() : 0);
5889 }
5890 
5891 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5892                                                             unsigned VF) {
5893   Type *ValTy = getMemInstValueType(I);
5894   Type *VectorTy = ToVectorTy(ValTy, VF);
5895   unsigned AS = getLoadStoreAddressSpace(I);
5896 
5897   auto Group = getInterleavedAccessGroup(I);
5898   assert(Group && "Fail to get an interleaved access group.");
5899 
5900   unsigned InterleaveFactor = Group->getFactor();
5901   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5902 
5903   // Holds the indices of existing members in an interleaved load group.
5904   // An interleaved store group doesn't need this as it doesn't allow gaps.
5905   SmallVector<unsigned, 4> Indices;
5906   if (isa<LoadInst>(I)) {
5907     for (unsigned i = 0; i < InterleaveFactor; i++)
5908       if (Group->getMember(i))
5909         Indices.push_back(i);
5910   }
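  // For example (illustrative): a load group with factor 4 whose members exist
  // only at positions 0 and 2 yields Indices = {0, 2}.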
5911 
5912   // Calculate the cost of the whole interleaved group.
5913   bool UseMaskForGaps =
5914       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5915   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5916       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5917       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5918 
5919   if (Group->isReverse()) {
5920     // TODO: Add support for reversed masked interleaved access.
5921     assert(!Legal->isMaskRequired(I) &&
5922            "Reverse masked interleaved access not supported.");
5923     Cost += Group->getNumMembers() *
5924             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5925   }
5926   return Cost;
5927 }
5928 
5929 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5930                                                               unsigned VF) {
  // Calculate the scalar cost only. The vectorization cost should already be
  // available at this point.
5933   if (VF == 1) {
5934     Type *ValTy = getMemInstValueType(I);
5935     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5936     unsigned AS = getLoadStoreAddressSpace(I);
5937 
5938     return TTI.getAddressComputationCost(ValTy) +
5939            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5940   }
5941   return getWideningCost(I, VF);
5942 }
5943 
5944 LoopVectorizationCostModel::VectorizationCostTy
5945 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5946   // If we know that this instruction will remain uniform, check the cost of
5947   // the scalar version.
5948   if (isUniformAfterVectorization(I, VF))
5949     VF = 1;
5950 
5951   if (VF > 1 && isProfitableToScalarize(I, VF))
5952     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5953 
5954   // Forced scalars do not have any scalarization overhead.
5955   auto ForcedScalar = ForcedScalars.find(VF);
5956   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5957     auto InstSet = ForcedScalar->second;
5958     if (InstSet.find(I) != InstSet.end())
5959       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5960   }
5961 
5962   Type *VectorTy;
5963   unsigned C = getInstructionCost(I, VF, VectorTy);
5964 
5965   bool TypeNotScalarized =
5966       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
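  // For example (assuming 128-bit vector registers), a <8 x i32> splits into
  // two parts, which is less than VF = 8, so the type is considered vectorized
  // rather than scalarized.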
5967   return VectorizationCostTy(C, TypeNotScalarized);
5968 }
5969 
5970 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5971                                                               unsigned VF) {
5972 
5973   if (VF == 1)
5974     return 0;
5975 
5976   unsigned Cost = 0;
5977   Type *RetTy = ToVectorTy(I->getType(), VF);
5978   if (!RetTy->isVoidTy() &&
5979       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5980     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5981 
5982   // Some targets keep addresses scalar.
5983   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5984     return Cost;
5985 
5986   // Some targets support efficient element stores.
5987   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5988     return Cost;
5989 
5990   // Collect operands to consider.
5991   CallInst *CI = dyn_cast<CallInst>(I);
5992   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5993 
5994   // Skip operands that do not require extraction/scalarization and do not incur
5995   // any overhead.
5996   return Cost + TTI.getOperandsScalarizationOverhead(
5997                     filterExtractingOperands(Ops, VF), VF);
5998 }
5999 
6000 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6001   if (VF == 1)
6002     return;
6003   NumPredStores = 0;
6004   for (BasicBlock *BB : TheLoop->blocks()) {
6005     // For each instruction in the old loop.
6006     for (Instruction &I : *BB) {
6007       Value *Ptr =  getLoadStorePointerOperand(&I);
6008       if (!Ptr)
6009         continue;
6010 
6011       // TODO: We should generate better code and update the cost model for
6012       // predicated uniform stores. Today they are treated as any other
6013       // predicated store (see added test cases in
6014       // invariant-store-vectorization.ll).
6015       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6016         NumPredStores++;
6017 
6018       if (Legal->isUniform(Ptr) &&
6019           // Conditional loads and stores should be scalarized and predicated.
6020           // isScalarWithPredication cannot be used here since masked
6021           // gather/scatters are not considered scalar with predication.
6022           !Legal->blockNeedsPredication(I.getParent())) {
6023         // TODO: Avoid replicating loads and stores instead of
6024         // relying on instcombine to remove them.
6025         // Load: Scalar load + broadcast
6026         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6027         unsigned Cost = getUniformMemOpCost(&I, VF);
6028         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6029         continue;
6030       }
6031 
6032       // We assume that widening is the best solution when possible.
6033       if (memoryInstructionCanBeWidened(&I, VF)) {
6034         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6035         int ConsecutiveStride =
6036                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6037         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6038                "Expected consecutive stride.");
6039         InstWidening Decision =
6040             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6041         setWideningDecision(&I, VF, Decision, Cost);
6042         continue;
6043       }
6044 
6045       // Choose between Interleaving, Gather/Scatter or Scalarization.
6046       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6047       unsigned NumAccesses = 1;
6048       if (isAccessInterleaved(&I)) {
6049         auto Group = getInterleavedAccessGroup(&I);
6050         assert(Group && "Fail to get an interleaved access group.");
6051 
6052         // Make one decision for the whole group.
6053         if (getWideningDecision(&I, VF) != CM_Unknown)
6054           continue;
6055 
6056         NumAccesses = Group->getNumMembers();
6057         if (interleavedAccessCanBeWidened(&I, VF))
6058           InterleaveCost = getInterleaveGroupCost(&I, VF);
6059       }
6060 
6061       unsigned GatherScatterCost =
6062           isLegalGatherOrScatter(&I)
6063               ? getGatherScatterCost(&I, VF) * NumAccesses
6064               : std::numeric_limits<unsigned>::max();
6065 
6066       unsigned ScalarizationCost =
6067           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6068 
6069       // Choose better solution for the current VF,
6070       // write down this decision and use it during vectorization.
6071       unsigned Cost;
6072       InstWidening Decision;
6073       if (InterleaveCost <= GatherScatterCost &&
6074           InterleaveCost < ScalarizationCost) {
6075         Decision = CM_Interleave;
6076         Cost = InterleaveCost;
6077       } else if (GatherScatterCost < ScalarizationCost) {
6078         Decision = CM_GatherScatter;
6079         Cost = GatherScatterCost;
6080       } else {
6081         Decision = CM_Scalarize;
6082         Cost = ScalarizationCost;
6083       }
      // If the instruction belongs to an interleave group, the whole group
6085       // receives the same decision. The whole group receives the cost, but
6086       // the cost will actually be assigned to one instruction.
6087       if (auto Group = getInterleavedAccessGroup(&I))
6088         setWideningDecision(Group, VF, Decision, Cost);
6089       else
6090         setWideningDecision(&I, VF, Decision, Cost);
6091     }
6092   }
6093 
  // Make sure that any load of an address, and any other address computation,
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
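  // For example, on a target that prefers scalar addressing, a widened load
  // of a pointer feeding another memory access would typically need one
  // extract per lane just to form the scalar addresses; keeping the load
  // scalar avoids that.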
6099   if (TTI.prefersVectorizedAddressing())
6100     return;
6101 
6102   // Start with all scalar pointer uses.
6103   SmallPtrSet<Instruction *, 8> AddrDefs;
6104   for (BasicBlock *BB : TheLoop->blocks())
6105     for (Instruction &I : *BB) {
6106       Instruction *PtrDef =
6107         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6108       if (PtrDef && TheLoop->contains(PtrDef) &&
6109           getWideningDecision(&I, VF) != CM_GatherScatter)
6110         AddrDefs.insert(PtrDef);
6111     }
6112 
6113   // Add all instructions used to generate the addresses.
6114   SmallVector<Instruction *, 4> Worklist;
6115   for (auto *I : AddrDefs)
6116     Worklist.push_back(I);
6117   while (!Worklist.empty()) {
6118     Instruction *I = Worklist.pop_back_val();
6119     for (auto &Op : I->operands())
6120       if (auto *InstOp = dyn_cast<Instruction>(Op))
6121         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6122             AddrDefs.insert(InstOp).second)
6123           Worklist.push_back(InstOp);
6124   }
6125 
6126   for (auto *I : AddrDefs) {
6127     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since that would require knowing whether the
      // loaded register is involved in an address computation, the decision
      // is instead changed here when we know this is the case.
6132       InstWidening Decision = getWideningDecision(I, VF);
6133       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6134         // Scalarize a widened load of address.
6135         setWideningDecision(I, VF, CM_Scalarize,
6136                             (VF * getMemoryInstructionCost(I, 1)));
6137       else if (auto Group = getInterleavedAccessGroup(I)) {
6138         // Scalarize an interleave group of address loads.
6139         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6140           if (Instruction *Member = Group->getMember(I))
6141             setWideningDecision(Member, VF, CM_Scalarize,
6142                                 (VF * getMemoryInstructionCost(Member, 1)));
6143         }
6144       }
6145     } else
      // Make sure I gets scalarized and that its cost is estimated without
      // scalarization overhead.
6148       ForcedScalars[VF].insert(I);
6149   }
6150 }
6151 
6152 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6153                                                         unsigned VF,
6154                                                         Type *&VectorTy) {
6155   Type *RetTy = I->getType();
6156   if (canTruncateToMinimalBitwidth(I, VF))
6157     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6158   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6159   auto SE = PSE.getSE();
6160 
6161   // TODO: We need to estimate the cost of intrinsic calls.
6162   switch (I->getOpcode()) {
6163   case Instruction::GetElementPtr:
6164     // We mark this instruction as zero-cost because the cost of GEPs in
6165     // vectorized code depends on whether the corresponding memory instruction
6166     // is scalarized or not. Therefore, we handle GEPs with the memory
6167     // instruction cost.
6168     return 0;
6169   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6173     bool ScalarPredicatedBB = false;
6174     BranchInst *BI = cast<BranchInst>(I);
6175     if (VF > 1 && BI->isConditional() &&
6176         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6177              PredicatedBBsAfterVectorization.end() ||
6178          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6179              PredicatedBBsAfterVectorization.end()))
6180       ScalarPredicatedBB = true;
6181 
6182     if (ScalarPredicatedBB) {
6183       // Return cost for branches around scalarized and predicated blocks.
6184       Type *Vec_i1Ty =
6185           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6186       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6187               (TTI.getCFInstrCost(Instruction::Br) * VF));
6188     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6189       // The back-edge branch will remain, as will all scalar branches.
6190       return TTI.getCFInstrCost(Instruction::Br);
6191     else
6192       // This branch will be eliminated by if-conversion.
6193       return 0;
6194     // Note: We currently assume zero cost for an unconditional branch inside
6195     // a predicated block since it will become a fall-through, although we
6196     // may decide in the future to call TTI for all branches.
6197   }
6198   case Instruction::PHI: {
6199     auto *Phi = cast<PHINode>(I);
6200 
6201     // First-order recurrences are replaced by vector shuffles inside the loop.
6202     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6203     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6204       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6205                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6206 
6207     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6208     // converted into select instructions. We require N - 1 selects per phi
6209     // node, where N is the number of incoming values.
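    // For example, a phi merging values from three if-converted predecessors
    // is lowered to two nested selects over the corresponding edge masks.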
6210     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6211       return (Phi->getNumIncomingValues() - 1) *
6212              TTI.getCmpSelInstrCost(
6213                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6214                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6215 
6216     return TTI.getCFInstrCost(Instruction::PHI);
6217   }
6218   case Instruction::UDiv:
6219   case Instruction::SDiv:
6220   case Instruction::URem:
6221   case Instruction::SRem:
6222     // If we have a predicated instruction, it may not be executed for each
6223     // vector lane. Get the scalarization cost and scale this amount by the
6224     // probability of executing the predicated block. If the instruction is not
6225     // predicated, we fall through to the next case.
6226     if (VF > 1 && isScalarWithPredication(I)) {
6227       unsigned Cost = 0;
6228 
6229       // These instructions have a non-void type, so account for the phi nodes
6230       // that we will create. This cost is likely to be zero. The phi node
6231       // cost, if any, should be scaled by the block probability because it
6232       // models a copy at the end of each predicated block.
6233       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6234 
6235       // The cost of the non-predicated instruction.
6236       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6237 
6238       // The cost of insertelement and extractelement instructions needed for
6239       // scalarization.
6240       Cost += getScalarizationOverhead(I, VF);
6241 
6242       // Scale the cost by the probability of executing the predicated blocks.
6243       // This assumes the predicated block for each vector lane is equally
6244       // likely.
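      // For example, assuming the default reciprocal probability of 2 (a
      // predicated block executing on roughly half of the iterations), the
      // accumulated cost above is halved.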
6245       return Cost / getReciprocalPredBlockProb();
6246     }
6247     LLVM_FALLTHROUGH;
6248   case Instruction::Add:
6249   case Instruction::FAdd:
6250   case Instruction::Sub:
6251   case Instruction::FSub:
6252   case Instruction::Mul:
6253   case Instruction::FMul:
6254   case Instruction::FDiv:
6255   case Instruction::FRem:
6256   case Instruction::Shl:
6257   case Instruction::LShr:
6258   case Instruction::AShr:
6259   case Instruction::And:
6260   case Instruction::Or:
6261   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
6263     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6264       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6267     Value *Op2 = I->getOperand(1);
6268     TargetTransformInfo::OperandValueProperties Op2VP;
6269     TargetTransformInfo::OperandValueKind Op2VK =
6270         TTI.getOperandInfo(Op2, Op2VP);
6271     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6272       Op2VK = TargetTransformInfo::OK_UniformValue;
6273 
6274     SmallVector<const Value *, 4> Operands(I->operand_values());
6275     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6276     return N * TTI.getArithmeticInstrCost(
6277                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6278                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6279   }
6280   case Instruction::FNeg: {
6281     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6282     return N * TTI.getArithmeticInstrCost(
6283                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6284                    TargetTransformInfo::OK_AnyValue,
6285                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6286                    I->getOperand(0), I);
6287   }
6288   case Instruction::Select: {
6289     SelectInst *SI = cast<SelectInst>(I);
6290     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6291     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6292     Type *CondTy = SI->getCondition()->getType();
6293     if (!ScalarCond)
6294       CondTy = VectorType::get(CondTy, VF);
6295 
6296     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6297   }
6298   case Instruction::ICmp:
6299   case Instruction::FCmp: {
6300     Type *ValTy = I->getOperand(0)->getType();
6301     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6302     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6303       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6304     VectorTy = ToVectorTy(ValTy, VF);
6305     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6306   }
6307   case Instruction::Store:
6308   case Instruction::Load: {
6309     unsigned Width = VF;
6310     if (Width > 1) {
6311       InstWidening Decision = getWideningDecision(I, Width);
6312       assert(Decision != CM_Unknown &&
6313              "CM decision should be taken at this point");
6314       if (Decision == CM_Scalarize)
6315         Width = 1;
6316     }
6317     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6318     return getMemoryInstructionCost(I, VF);
6319   }
6320   case Instruction::ZExt:
6321   case Instruction::SExt:
6322   case Instruction::FPToUI:
6323   case Instruction::FPToSI:
6324   case Instruction::FPExt:
6325   case Instruction::PtrToInt:
6326   case Instruction::IntToPtr:
6327   case Instruction::SIToFP:
6328   case Instruction::UIToFP:
6329   case Instruction::Trunc:
6330   case Instruction::FPTrunc:
6331   case Instruction::BitCast: {
6332     // We optimize the truncation of induction variables having constant
6333     // integer steps. The cost of these truncations is the same as the scalar
6334     // operation.
6335     if (isOptimizableIVTruncate(I, VF)) {
6336       auto *Trunc = cast<TruncInst>(I);
6337       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6338                                   Trunc->getSrcTy(), Trunc);
6339     }
6340 
6341     Type *SrcScalarTy = I->getOperand(0)->getType();
6342     Type *SrcVecTy =
6343         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6344     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or turn it
      // into a slightly different cast. For example, if MinBW == 16,
6347       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6348       //
6349       // Calculate the modified src and dest types.
6350       Type *MinVecTy = VectorTy;
6351       if (I->getOpcode() == Instruction::Trunc) {
6352         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6353         VectorTy =
6354             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6355       } else if (I->getOpcode() == Instruction::ZExt ||
6356                  I->getOpcode() == Instruction::SExt) {
6357         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6358         VectorTy =
6359             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6360       }
6361     }
6362 
6363     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6364     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6365   }
6366   case Instruction::Call: {
6367     bool NeedToScalarize;
6368     CallInst *CI = cast<CallInst>(I);
6369     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6370     if (getVectorIntrinsicIDForCall(CI, TLI))
6371       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6372     return CallCost;
6373   }
6374   default:
6375     // The cost of executing VF copies of the scalar instruction. This opcode
6376     // is unknown. Assume that it is the same as 'mul'.
6377     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6378            getScalarizationOverhead(I, VF);
6379   } // end of switch.
6380 }
6381 
6382 char LoopVectorize::ID = 0;
6383 
6384 static const char lv_name[] = "Loop Vectorization";
6385 
6386 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6387 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6388 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6389 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6390 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6391 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6392 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6393 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6394 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6395 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6396 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6397 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6398 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6399 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6400 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6401 
6402 namespace llvm {
6403 
6404 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6405 
6406 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6407                               bool VectorizeOnlyWhenForced) {
6408   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6409 }
6410 
6411 } // end namespace llvm
6412 
6413 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6414   // Check if the pointer operand of a load or store instruction is
6415   // consecutive.
6416   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6417     return Legal->isConsecutivePtr(Ptr);
6418   return false;
6419 }
6420 
6421 void LoopVectorizationCostModel::collectValuesToIgnore() {
6422   // Ignore ephemeral values.
6423   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6424 
6425   // Ignore type-promoting instructions we identified during reduction
6426   // detection.
6427   for (auto &Reduction : *Legal->getReductionVars()) {
6428     RecurrenceDescriptor &RedDes = Reduction.second;
6429     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6430     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6431   }
6432   // Ignore type-casting instructions we identified during induction
6433   // detection.
6434   for (auto &Induction : *Legal->getInductionVars()) {
6435     InductionDescriptor &IndDes = Induction.second;
6436     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6437     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6438   }
6439 }
6440 
6441 // TODO: we could return a pair of values that specify the max VF and
6442 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6444 // doesn't have a cost model that can choose which plan to execute if
6445 // more than one is generated.
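// For example, with 256-bit wide vector registers and a widest scalar type of
// i32, determineVPlanVF returns a VF of 256 / 32 = 8.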
6446 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6447                                  LoopVectorizationCostModel &CM) {
6448   unsigned WidestType;
6449   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6450   return WidestVectorRegBits / WidestType;
6451 }
6452 
6453 VectorizationFactor
6454 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6455   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
6458   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6459   // the vectorization pipeline.
6460   if (!OrigLoop->empty()) {
6461     // If the user doesn't provide a vectorization factor, determine a
6462     // reasonable one.
6463     if (!UserVF) {
6464       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6465       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6466 
6467       // Make sure we have a VF > 1 for stress testing.
6468       if (VPlanBuildStressTest && VF < 2) {
6469         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6470                           << "overriding computed VF.\n");
6471         VF = 4;
6472       }
6473     }
6474     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6475     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6476     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6477                       << " to build VPlans.\n");
6478     buildVPlans(VF, VF);
6479 
6480     // For VPlan build stress testing, we bail out after VPlan construction.
6481     if (VPlanBuildStressTest)
6482       return VectorizationFactor::Disabled();
6483 
6484     return {VF, 0};
6485   }
6486 
6487   LLVM_DEBUG(
6488       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6489                 "VPlan-native path.\n");
6490   return VectorizationFactor::Disabled();
6491 }
6492 
6493 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6494   assert(OrigLoop->empty() && "Inner loop expected.");
6495   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6497     return None;
6498 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
6500   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6501       !useMaskedInterleavedAccesses(*TTI)) {
6502     LLVM_DEBUG(
6503         dbgs()
6504         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6505            "which requires masked-interleaved support.\n");
6506     CM.InterleaveInfo.reset();
6507   }
6508 
6509   if (UserVF) {
6510     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6511     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6512     // Collect the instructions (and their associated costs) that will be more
6513     // profitable to scalarize.
6514     CM.selectUserVectorizationFactor(UserVF);
6515     buildVPlansWithVPRecipes(UserVF, UserVF);
6516     LLVM_DEBUG(printPlans(dbgs()));
6517     return {{UserVF, 0}};
6518   }
6519 
6520   unsigned MaxVF = MaybeMaxVF.getValue();
6521   assert(MaxVF != 0 && "MaxVF is zero.");
6522 
6523   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6524     // Collect Uniform and Scalar instructions after vectorization with VF.
6525     CM.collectUniformsAndScalars(VF);
6526 
6527     // Collect the instructions (and their associated costs) that will be more
6528     // profitable to scalarize.
6529     if (VF > 1)
6530       CM.collectInstsToScalarize(VF);
6531   }
6532 
6533   buildVPlansWithVPRecipes(1, MaxVF);
6534   LLVM_DEBUG(printPlans(dbgs()));
6535   if (MaxVF == 1)
6536     return VectorizationFactor::Disabled();
6537 
6538   // Select the optimal vectorization factor.
6539   return CM.selectVectorizationFactor(MaxVF);
6540 }
6541 
6542 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6543   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6544                     << '\n');
6545   BestVF = VF;
6546   BestUF = UF;
6547 
6548   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6549     return !Plan->hasVF(VF);
6550   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6552 }
6553 
6554 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6555                                            DominatorTree *DT) {
6556   // Perform the actual loop transformation.
6557 
6558   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6559   VPCallbackILV CallbackILV(ILV);
6560 
6561   VPTransformState State{BestVF, BestUF,      LI,
6562                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6563                          &ILV,   CallbackILV};
6564   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6565   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6566 
6567   //===------------------------------------------------===//
6568   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
6571   // the cost-model.
6572   //
6573   //===------------------------------------------------===//
6574 
6575   // 2. Copy and widen instructions from the old loop into the new loop.
6576   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6577   VPlans.front()->execute(&State);
6578 
6579   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6580   //    predication, updating analyses.
6581   ILV.fixVectorizedLoop();
6582 }
6583 
6584 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6585     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6586   BasicBlock *Latch = OrigLoop->getLoopLatch();
6587 
6588   // We create new control-flow for the vectorized loop, so the original
6589   // condition will be dead after vectorization if it's only used by the
6590   // branch.
6591   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6592   if (Cmp && Cmp->hasOneUse())
6593     DeadInstructions.insert(Cmp);
6594 
6595   // We create new "steps" for induction variable updates to which the original
6596   // induction variables map. An original update instruction will be dead if
6597   // all its users except the induction variable are dead.
6598   for (auto &Induction : *Legal->getInductionVars()) {
6599     PHINode *Ind = Induction.first;
6600     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6601     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6602           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6603                                  DeadInstructions.end();
6604         }))
6605       DeadInstructions.insert(IndUpdate);
6606 
6607     // We record as "Dead" also the type-casting instructions we had identified
6608     // during induction analysis. We don't need any handling for them in the
6609     // vectorized loop because we have proven that, under a proper runtime
6610     // test guarding the vectorized loop, the value of the phi, and the casted
6611     // value of the phi, are the same. The last instruction in this casting chain
6612     // will get its scalar/vector/widened def from the scalar/vector/widened def
6613     // of the respective phi node. Any other casts in the induction def-use chain
6614     // have no other uses outside the phi update chain, and will be ignored.
6615     InductionDescriptor &IndDes = Induction.second;
6616     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6617     DeadInstructions.insert(Casts.begin(), Casts.end());
6618   }
6619 }
6620 
6621 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6622 
6623 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6624 
6625 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6626                                         Instruction::BinaryOps BinOp) {
6627   // When unrolling and the VF is 1, we only need to add a simple scalar.
6628   Type *Ty = Val->getType();
6629   assert(!Ty->isVectorTy() && "Val must be a scalar");
6630 
6631   if (Ty->isFloatingPointTy()) {
6632     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6633 
6634     // Floating point operations had to be 'fast' to enable the unrolling.
6635     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6636     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6637   }
6638   Constant *C = ConstantInt::get(Ty, StartIdx);
6639   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6640 }
6641 
6642 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6643   SmallVector<Metadata *, 4> MDs;
6644   // Reserve first location for self reference to the LoopID metadata node.
6645   MDs.push_back(nullptr);
6646   bool IsUnrollMetadata = false;
6647   MDNode *LoopID = L->getLoopID();
6648   if (LoopID) {
6649     // First find existing loop unrolling disable metadata.
6650     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6651       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6652       if (MD) {
6653         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6654         IsUnrollMetadata =
6655             S && S->getString().startswith("llvm.loop.unroll.disable");
6656       }
6657       MDs.push_back(LoopID->getOperand(i));
6658     }
6659   }
6660 
6661   if (!IsUnrollMetadata) {
6662     // Add runtime unroll disable metadata.
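    // The rewritten loop ID will look roughly like:
    //   !0 = distinct !{!0, <existing operands...>, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}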
6663     LLVMContext &Context = L->getHeader()->getContext();
6664     SmallVector<Metadata *, 1> DisableOperands;
6665     DisableOperands.push_back(
6666         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6667     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6668     MDs.push_back(DisableNode);
6669     MDNode *NewLoopID = MDNode::get(Context, MDs);
6670     // Set operand 0 to refer to the loop id itself.
6671     NewLoopID->replaceOperandWith(0, NewLoopID);
6672     L->setLoopID(NewLoopID);
6673   }
6674 }
6675 
6676 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6677     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6678   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
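  // For example, given Range = {2, 16}, if Predicate(2) == Predicate(4) but
  // Predicate(8) differs, Range.End is clamped to 8 and the predicate's value
  // at Range.Start (i.e. at VF 2) is returned.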
6679   bool PredicateAtRangeStart = Predicate(Range.Start);
6680 
6681   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6682     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6683       Range.End = TmpVF;
6684       break;
6685     }
6686 
6687   return PredicateAtRangeStart;
6688 }
6689 
/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VFs starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
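/// For example, starting with MinVF = 2 and MaxVF = 8, the first VPlan may end
/// up covering VFs {2, 4} and a second VPlan covering {8}, depending on where
/// the vectorization decisions change across the range.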
6695 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6696   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6697     VFRange SubRange = {VF, MaxVF + 1};
6698     VPlans.push_back(buildVPlan(SubRange));
6699     VF = SubRange.End;
6700   }
6701 }
6702 
6703 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6704                                          VPlanPtr &Plan) {
6705   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6706 
6707   // Look for cached value.
6708   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6709   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6710   if (ECEntryIt != EdgeMaskCache.end())
6711     return ECEntryIt->second;
6712 
6713   VPValue *SrcMask = createBlockInMask(Src, Plan);
6714 
6715   // The terminator has to be a branch inst!
6716   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6717   assert(BI && "Unexpected terminator found");
6718 
6719   if (!BI->isConditional())
6720     return EdgeMaskCache[Edge] = SrcMask;
6721 
6722   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6723   assert(EdgeMask && "No Edge Mask found for condition");
6724 
6725   if (BI->getSuccessor(0) != Dst)
6726     EdgeMask = Builder.createNot(EdgeMask);
6727 
6728   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6729     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6730 
6731   return EdgeMaskCache[Edge] = EdgeMask;
6732 }
6733 
6734 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6735   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6736 
6737   // Look for cached value.
6738   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6739   if (BCEntryIt != BlockMaskCache.end())
6740     return BCEntryIt->second;
6741 
6742   // All-one mask is modelled as no-mask following the convention for masked
6743   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6744   VPValue *BlockMask = nullptr;
6745 
6746   if (OrigLoop->getHeader() == BB) {
6747     if (!CM.blockNeedsPredication(BB))
6748       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6749 
6750     // Introduce the early-exit compare IV <= BTC to form header block mask.
6751     // This is used instead of IV < TC because TC may wrap, unlike BTC.
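    // For example, a loop executing exactly 2^32 iterations has a 32-bit trip
    // count of 0, while its backedge-taken count of 2^32 - 1 is representable.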
6752     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6753     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6754     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6755     return BlockMaskCache[BB] = BlockMask;
6756   }
6757 
6758   // This is the block mask. We OR all incoming edges.
6759   for (auto *Predecessor : predecessors(BB)) {
6760     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6761     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6762       return BlockMaskCache[BB] = EdgeMask;
6763 
    if (!BlockMask) { // BlockMask is still at its initial nullptr value.
6765       BlockMask = EdgeMask;
6766       continue;
6767     }
6768 
6769     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6770   }
6771 
6772   return BlockMaskCache[BB] = BlockMask;
6773 }
6774 
6775 VPWidenMemoryInstructionRecipe *
6776 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6777                                   VPlanPtr &Plan) {
6778   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6779     return nullptr;
6780 
6781   auto willWiden = [&](unsigned VF) -> bool {
6782     if (VF == 1)
6783       return false;
6784     LoopVectorizationCostModel::InstWidening Decision =
6785         CM.getWideningDecision(I, VF);
6786     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6787            "CM decision should be taken at this point.");
6788     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6789       return true;
6790     if (CM.isScalarAfterVectorization(I, VF) ||
6791         CM.isProfitableToScalarize(I, VF))
6792       return false;
6793     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6794   };
6795 
6796   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6797     return nullptr;
6798 
6799   VPValue *Mask = nullptr;
6800   if (Legal->isMaskRequired(I))
6801     Mask = createBlockInMask(I->getParent(), Plan);
6802 
6803   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6804 }
6805 
6806 VPWidenIntOrFpInductionRecipe *
6807 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6808   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6809     // Check if this is an integer or fp induction. If so, build the recipe that
6810     // produces its scalar and vector values.
6811     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6812     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6813         II.getKind() == InductionDescriptor::IK_FpInduction)
6814       return new VPWidenIntOrFpInductionRecipe(Phi);
6815 
6816     return nullptr;
6817   }
6818 
6819   // Optimize the special case where the source is a constant integer
6820   // induction variable. Notice that we can only optimize the 'trunc' case
6821   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6822   // (c) other casts depend on pointer size.
6823 
6824   // Determine whether \p K is a truncation based on an induction variable that
6825   // can be optimized.
6826   auto isOptimizableIVTruncate =
6827       [&](Instruction *K) -> std::function<bool(unsigned)> {
6828     return
6829         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6830   };
6831 
6832   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6833                                isOptimizableIVTruncate(I), Range))
6834     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6835                                              cast<TruncInst>(I));
6836   return nullptr;
6837 }
6838 
6839 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6840   PHINode *Phi = dyn_cast<PHINode>(I);
6841   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6842     return nullptr;
6843 
6844   // We know that all PHIs in non-header blocks are converted into selects, so
6845   // we don't have to worry about the insertion order and we can just use the
6846   // builder. At this point we generate the predication tree. There may be
6847   // duplications since this is a simple recursive scan, but future
6848   // optimizations will clean it up.
6849 
6850   SmallVector<VPValue *, 2> Masks;
6851   unsigned NumIncoming = Phi->getNumIncomingValues();
6852   for (unsigned In = 0; In < NumIncoming; In++) {
6853     VPValue *EdgeMask =
6854       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6855     assert((EdgeMask || NumIncoming == 1) &&
6856            "Multiple predecessors with one having a full mask");
6857     if (EdgeMask)
6858       Masks.push_back(EdgeMask);
6859   }
6860   return new VPBlendRecipe(Phi, Masks);
6861 }
6862 
6863 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6864                                  VFRange &Range) {
6865 
6866   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6867       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6868 
6869   if (IsPredicated)
6870     return false;
6871 
6872   auto IsVectorizableOpcode = [](unsigned Opcode) {
6873     switch (Opcode) {
6874     case Instruction::Add:
6875     case Instruction::And:
6876     case Instruction::AShr:
6877     case Instruction::BitCast:
6878     case Instruction::Br:
6879     case Instruction::Call:
6880     case Instruction::FAdd:
6881     case Instruction::FCmp:
6882     case Instruction::FDiv:
6883     case Instruction::FMul:
6884     case Instruction::FNeg:
6885     case Instruction::FPExt:
6886     case Instruction::FPToSI:
6887     case Instruction::FPToUI:
6888     case Instruction::FPTrunc:
6889     case Instruction::FRem:
6890     case Instruction::FSub:
6891     case Instruction::ICmp:
6892     case Instruction::IntToPtr:
6893     case Instruction::Load:
6894     case Instruction::LShr:
6895     case Instruction::Mul:
6896     case Instruction::Or:
6897     case Instruction::PHI:
6898     case Instruction::PtrToInt:
6899     case Instruction::SDiv:
6900     case Instruction::Select:
6901     case Instruction::SExt:
6902     case Instruction::Shl:
6903     case Instruction::SIToFP:
6904     case Instruction::SRem:
6905     case Instruction::Store:
6906     case Instruction::Sub:
6907     case Instruction::Trunc:
6908     case Instruction::UDiv:
6909     case Instruction::UIToFP:
6910     case Instruction::URem:
6911     case Instruction::Xor:
6912     case Instruction::ZExt:
6913       return true;
6914     }
6915     return false;
6916   };
6917 
6918   if (!IsVectorizableOpcode(I->getOpcode()))
6919     return false;
6920 
6921   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6922     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6923     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6924                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6925       return false;
6926   }
6927 
6928   auto willWiden = [&](unsigned VF) -> bool {
6929     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6930                              CM.isProfitableToScalarize(I, VF)))
6931       return false;
6932     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6933       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // NeedToScalarize indicates whether the call is better scalarized than
      // turned into a vector library call; we also check whether using a
      // vector intrinsic is cheaper than a vectorized call.
6938       bool NeedToScalarize;
6939       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6940       bool UseVectorIntrinsic =
6941           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6942       return UseVectorIntrinsic || !NeedToScalarize;
6943     }
6944     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6945       assert(CM.getWideningDecision(I, VF) ==
6946                  LoopVectorizationCostModel::CM_Scalarize &&
6947              "Memory widening decisions should have been taken care by now");
6948       return false;
6949     }
6950     return true;
6951   };
6952 
6953   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6954     return false;
6955   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6956   // to avoid having to split recipes later.
6957   bool IsSingleton = Ingredient2Recipe.count(I);
6958 
6959   // Success: widen this instruction.
6960 
6961   // Use the default widening recipe. We optimize the common case where
6962   // consecutive instructions can be represented by a single recipe.
6963   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6964       LastExtensibleRecipe->appendInstruction(I))
6965     return true;
6966 
6967   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6968   if (!IsSingleton)
6969     LastExtensibleRecipe = WidenRecipe;
6970   setRecipe(I, WidenRecipe);
6971   VPBB->appendRecipe(WidenRecipe);
6972   return true;
6973 }
6974 
6975 VPBasicBlock *VPRecipeBuilder::handleReplication(
6976     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6977     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6978     VPlanPtr &Plan) {
6979   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6980       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6981       Range);
6982 
6983   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6984       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6985 
6986   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6987   setRecipe(I, Recipe);
6988 
6989   // Find if I uses a predicated instruction. If so, it will use its scalar
6990   // value. Avoid hoisting the insert-element which packs the scalar value into
6991   // a vector value, as that happens iff all users use the vector value.
6992   for (auto &Op : I->operands())
6993     if (auto *PredInst = dyn_cast<Instruction>(Op))
6994       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6995         PredInst2Recipe[PredInst]->setAlsoPack(false);
6996 
6997   // Finalize the recipe for Instr, first if it is not predicated.
6998   if (!IsPredicated) {
6999     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7000     VPBB->appendRecipe(Recipe);
7001     return VPBB;
7002   }
7003   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7004   assert(VPBB->getSuccessors().empty() &&
7005          "VPBB has successors when handling predicated replication.");
7006   // Record predicated instructions for above packing optimizations.
7007   PredInst2Recipe[I] = Recipe;
7008   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7009   VPBlockUtils::insertBlockAfter(Region, VPBB);
7010   auto *RegSucc = new VPBasicBlock();
7011   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7012   return RegSucc;
7013 }
7014 
7015 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7016                                                       VPRecipeBase *PredRecipe,
7017                                                       VPlanPtr &Plan) {
7018   // Instructions marked for predication are replicated and placed under an
7019   // if-then construct to prevent side-effects.
7020 
7021   // Generate recipes to compute the block mask for this region.
7022   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7023 
7024   // Build the triangular if-then region.
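  // The resulting structure is roughly:
  //
  //   pred.<op>.entry  (branch on BlockInMask)
  //    |          \
  //    |     pred.<op>.if  (the replicated, predicated instruction)
  //    |          /
  //   pred.<op>.continue  (phi merging the predicated value, if non-void)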
7025   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7026   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7027   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7028   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7029   auto *PHIRecipe =
7030       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7031   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7032   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7033   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7034 
7035   // Note: first set Entry as region entry and then connect successors starting
7036   // from it in order, to propagate the "parent" of each VPBasicBlock.
7037   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7038   VPBlockUtils::connectBlocks(Pred, Exit);
7039 
7040   return Region;
7041 }
7042 
7043 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7044                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7045   VPRecipeBase *Recipe = nullptr;
7046 
7047   // First, check for specific widening recipes that deal with memory
7048   // operations, inductions and Phi nodes.
7049   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7050       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7051       (Recipe = tryToBlend(Instr, Plan)) ||
7052       (isa<PHINode>(Instr) &&
7053        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7054     setRecipe(Instr, Recipe);
7055     VPBB->appendRecipe(Recipe);
7056     return true;
7057   }
7058 
7059   // Handle GEP widening.
7060   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7061     auto Scalarize = [&](unsigned VF) {
7062       return CM.isScalarWithPredication(Instr, VF) ||
7063              CM.isScalarAfterVectorization(Instr, VF) ||
7064              CM.isProfitableToScalarize(Instr, VF);
7065     };
7066     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7067       return false;
7068     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7069     setRecipe(Instr, Recipe);
7070     VPBB->appendRecipe(Recipe);
7071     return true;
7072   }
7073 
7074   // Check if Instr is to be widened by a general VPWidenRecipe, after
7075   // having first checked for specific widening recipes.
7076   if (tryToWiden(Instr, VPBB, Range))
7077     return true;
7078 
7079   return false;
7080 }
7081 
7082 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7083                                                         unsigned MaxVF) {
7084   assert(OrigLoop->empty() && "Inner loop expected.");
7085 
7086   // Collect conditions feeding internal conditional branches; they need to be
7087   // represented in VPlan for it to model masking.
7088   SmallPtrSet<Value *, 1> NeedDef;
7089 
7090   auto *Latch = OrigLoop->getLoopLatch();
7091   for (BasicBlock *BB : OrigLoop->blocks()) {
7092     if (BB == Latch)
7093       continue;
7094     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7095     if (Branch && Branch->isConditional())
7096       NeedDef.insert(Branch->getCondition());
7097   }
7098 
7099   // If the tail is to be folded by masking, the primary induction variable
7100   // needs to be represented in VPlan for it to model early-exit masking.
7101   // Also, both the Phi and the live-out instruction of each reduction are
7102   // required in order to introduce a select between them in VPlan.
7103   if (CM.foldTailByMasking()) {
7104     NeedDef.insert(Legal->getPrimaryInduction());
7105     for (auto &Reduction : *Legal->getReductionVars()) {
7106       NeedDef.insert(Reduction.first);
7107       NeedDef.insert(Reduction.second.getLoopExitInstr());
7108     }
7109   }
7110 
7111   // Collect instructions from the original loop that will become trivially dead
7112   // in the vectorized loop. We don't need to vectorize these instructions. For
7113   // example, original induction update instructions can become dead because we
7114   // separately emit induction "steps" when generating code for the new loop.
7115   // Similarly, we create a new latch condition when setting up the structure
7116   // of the new loop, so the old one can become dead.
7117   SmallPtrSet<Instruction *, 4> DeadInstructions;
7118   collectTriviallyDeadInstructions(DeadInstructions);
7119 
7120   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7121     VFRange SubRange = {VF, MaxVF + 1};
7122     VPlans.push_back(
7123         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
7124     VF = SubRange.End;
7125   }
7126 }
7127 
7128 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7129     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7130     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7131 
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7135   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7136 
7137   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7138 
7139   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7140 
7141   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7142 
7143   // ---------------------------------------------------------------------------
7144   // Pre-construction: record ingredients whose recipes we'll need to further
7145   // process after constructing the initial VPlan.
7146   // ---------------------------------------------------------------------------
7147 
7148   // Mark instructions we'll need to sink later and their targets as
7149   // ingredients whose recipe we'll need to record.
7150   for (auto &Entry : SinkAfter) {
7151     RecipeBuilder.recordRecipeOf(Entry.first);
7152     RecipeBuilder.recordRecipeOf(Entry.second);
7153   }
7154 
7155   // For each interleave group which is relevant for this (possibly trimmed)
7156   // Range, add it to the set of groups to be later applied to the VPlan and add
7157   // placeholders for its members' Recipes which we'll be replacing with a
7158   // single VPInterleaveRecipe.
7159   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7160     auto applyIG = [IG, this](unsigned VF) -> bool {
7161       return (VF >= 2 && // Query is illegal for VF == 1
7162               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7163                   LoopVectorizationCostModel::CM_Interleave);
7164     };
7165     if (!getDecisionAndClampRange(applyIG, Range))
7166       continue;
7167     InterleaveGroups.insert(IG);
7168     for (unsigned i = 0; i < IG->getFactor(); i++)
7169       if (Instruction *Member = IG->getMember(i))
7170         RecipeBuilder.recordRecipeOf(Member);
7171   };
7172 
7173   // ---------------------------------------------------------------------------
7174   // Build initial VPlan: Scan the body of the loop in a topological order to
7175   // visit each basic block after having visited its predecessor basic blocks.
7176   // ---------------------------------------------------------------------------
7177 
7178   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7179   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7180   auto Plan = std::make_unique<VPlan>(VPBB);
7181 
7182   // Represent values that will have defs inside VPlan.
7183   for (Value *V : NeedDef)
7184     Plan->addVPValue(V);
7185 
7186   // Scan the body of the loop in a topological order to visit each basic block
7187   // after having visited its predecessor basic blocks.
7188   LoopBlocksDFS DFS(OrigLoop);
7189   DFS.perform(LI);
7190 
7191   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and will fill a new VPBasicBlock.
7194     unsigned VPBBsForBB = 0;
7195     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7196     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7197     VPBB = FirstVPBBForBB;
7198     Builder.setInsertPoint(VPBB);
7199 
7200     // Introduce each ingredient into VPlan.
7201     for (Instruction &I : BB->instructionsWithoutDebug()) {
7202       Instruction *Instr = &I;
7203 
7204       // First filter out irrelevant instructions, to ensure no recipes are
7205       // built for them.
7206       if (isa<BranchInst>(Instr) ||
7207           DeadInstructions.find(Instr) != DeadInstructions.end())
7208         continue;
7209 
7210       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7211         continue;
7212 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7215       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7216           Instr, Range, VPBB, PredInst2Recipe, Plan);
7217       if (NextVPBB != VPBB) {
7218         VPBB = NextVPBB;
7219         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7220                                     : "");
7221       }
7222     }
7223   }
7224 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7228   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7229   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7230   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7231   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7232   delete PreEntry;
7233 
7234   // ---------------------------------------------------------------------------
7235   // Transform initial VPlan: Apply previously taken decisions, in order, to
7236   // bring the VPlan to its final state.
7237   // ---------------------------------------------------------------------------
7238 
7239   // Apply Sink-After legal constraints.
7240   for (auto &Entry : SinkAfter) {
7241     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7242     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7243     Sink->moveAfter(Target);
7244   }
7245 
7246   // Interleave memory: for each Interleave Group we marked earlier as relevant
7247   // for this VPlan, replace the Recipes widening its memory instructions with a
7248   // single VPInterleaveRecipe at its insertion point.
7249   for (auto IG : InterleaveGroups) {
7250     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7251         RecipeBuilder.getRecipe(IG->getInsertPos()));
7252     (new VPInterleaveRecipe(IG, Recipe->getMask()))->insertBefore(Recipe);
7253 
7254     for (unsigned i = 0; i < IG->getFactor(); ++i)
7255       if (Instruction *Member = IG->getMember(i)) {
7256         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7257       }
7258   }
7259 
7260   // Finally, if tail is folded by masking, introduce selects between the phi
7261   // and the live-out instruction of each reduction, at the end of the latch.
7262   if (CM.foldTailByMasking()) {
7263     Builder.setInsertPoint(VPBB);
7264     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7265     for (auto &Reduction : *Legal->getReductionVars()) {
7266       VPValue *Phi = Plan->getVPValue(Reduction.first);
7267       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7268       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7269     }
7270   }
7271 
7272   std::string PlanName;
7273   raw_string_ostream RSO(PlanName);
7274   unsigned VF = Range.Start;
7275   Plan->addVF(VF);
7276   RSO << "Initial VPlan for VF={" << VF;
7277   for (VF *= 2; VF < Range.End; VF *= 2) {
7278     Plan->addVF(VF);
7279     RSO << "," << VF;
7280   }
7281   RSO << "},UF>=1";
7282   RSO.flush();
7283   Plan->setName(PlanName);
7284 
7285   return Plan;
7286 }
7287 
7288 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
7291   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7292   // the vectorization pipeline.
7293   assert(!OrigLoop->empty());
7294   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7295 
7296   // Create new empty VPlan
7297   auto Plan = std::make_unique<VPlan>();
7298 
7299   // Build hierarchical CFG
7300   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7301   HCFGBuilder.buildHierarchicalCFG();
7302 
7303   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7304     Plan->addVF(VF);
7305 
7306   if (EnableVPlanPredication) {
7307     VPlanPredicator VPP(*Plan);
7308     VPP.predicate();
7309 
7310     // Avoid running transformation to recipes until masked code generation in
7311     // VPlan-native path is in place.
7312     return Plan;
7313   }
7314 
7315   SmallPtrSet<Instruction *, 1> DeadInstructions;
7316   VPlanTransforms::VPInstructionsToVPRecipes(
7317       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7318   return Plan;
7319 }
7320 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7325 
7326 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7327   O << " +\n"
7328     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7329   IG->getInsertPos()->printAsOperand(O, false);
7330   if (User) {
7331     O << ", ";
7332     User->getOperand(0)->printAsOperand(O);
7333   }
7334   O << "\\l\"";
7335   for (unsigned i = 0; i < IG->getFactor(); ++i)
7336     if (Instruction *I = IG->getMember(i))
7337       O << " +\n"
7338         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7339 }
7340 
7341 void VPWidenRecipe::execute(VPTransformState &State) {
7342   for (auto &Instr : make_range(Begin, End))
7343     State.ILV->widenInstruction(Instr);
7344 }
7345 
7346 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7347   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7348                       IsIndexLoopInvariant);
7349 }
7350 
7351 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7352   assert(!State.Instance && "Int or FP induction being replicated.");
7353   State.ILV->widenIntOrFpInduction(IV, Trunc);
7354 }
7355 
7356 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7357   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7358 }
7359 
7360 void VPBlendRecipe::execute(VPTransformState &State) {
7361   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7362   // We know that all PHIs in non-header blocks are converted into
7363   // selects, so we don't have to worry about the insertion order and we
7364   // can just use the builder.
7365   // At this point we generate the predication tree. There may be
7366   // duplications since this is a simple recursive scan, but future
7367   // optimizations will clean it up.
7368 
7369   unsigned NumIncoming = Phi->getNumIncomingValues();
7370 
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7373   // Generate a sequence of selects of the form:
7374   // SELECT(Mask3, In3,
7375   //      SELECT(Mask2, In2,
7376   //                   ( ...)))
7377   InnerLoopVectorizer::VectorParts Entry(State.UF);
7378   for (unsigned In = 0; In < NumIncoming; ++In) {
7379     for (unsigned Part = 0; Part < State.UF; ++Part) {
7380       // We might have single edge PHIs (blocks) - use an identity
7381       // 'select' for the first PHI operand.
7382       Value *In0 =
7383           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7384       if (In == 0)
7385         Entry[Part] = In0; // Initialize with the first incoming value.
7386       else {
7387         // Select between the current value and the previous incoming edge
7388         // based on the incoming mask.
7389         Value *Cond = State.get(User->getOperand(In), Part);
7390         Entry[Part] =
7391             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7392       }
7393     }
7394   }
7395   for (unsigned Part = 0; Part < State.UF; ++Part)
7396     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7397 }
7398 
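// Widen the recipe's interleave group into wide memory accesses plus shuffles;
// e.g. a factor-2 group covering the strided accesses A[2*i] and A[2*i+1] is
// lowered to a single wide load or store. A mask, when present, is the last
// operand and is forwarded per unrolled part.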
7399 void VPInterleaveRecipe::execute(VPTransformState &State) {
7400   assert(!State.Instance && "Interleave group being replicated.");
7401   if (!User)
7402     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7403 
  // The last (and currently only) operand is the mask.
7405   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7406   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7407   for (unsigned Part = 0; Part < State.UF; ++Part)
7408     MaskValues[Part] = State.get(Mask, Part);
7409   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7410 }
7411 
7412 void VPReplicateRecipe::execute(VPTransformState &State) {
7413   if (State.Instance) { // Generate a single instance.
7414     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7415     // Insert scalar instance packing it into a vector.
7416     if (AlsoPack && State.VF > 1) {
7417       // If we're constructing lane 0, initialize to start from undef.
7418       if (State.Instance->Lane == 0) {
7419         Value *Undef =
7420             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7421         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7422       }
7423       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7424     }
7425     return;
7426   }
7427 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
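  // For example, with UF = 2 and VF = 4 a non-uniform instruction is
  // replicated eight times, while a uniform one is replicated twice (lane 0 of
  // each part).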
7431   unsigned EndLane = IsUniform ? 1 : State.VF;
7432   for (unsigned Part = 0; Part < State.UF; ++Part)
7433     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7434       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7435 }
7436 
7437 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7438   assert(State.Instance && "Branch on Mask works only on single instance.");
7439 
7440   unsigned Part = State.Instance->Part;
7441   unsigned Lane = State.Instance->Lane;
7442 
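  // Materialize the scalar condition for this {Part, Lane} instance; e.g. for
  // lane 2 the condition is extracted from element 2 of this part's mask
  // vector below.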
7443   Value *ConditionBit = nullptr;
7444   if (!User) // Block in mask is all-one.
7445     ConditionBit = State.Builder.getTrue();
7446   else {
7447     VPValue *BlockInMask = User->getOperand(0);
7448     ConditionBit = State.get(BlockInMask, Part);
7449     if (ConditionBit->getType()->isVectorTy())
7450       ConditionBit = State.Builder.CreateExtractElement(
7451           ConditionBit, State.Builder.getInt32(Lane));
7452   }
7453 
7454   // Replace the temporary unreachable terminator with a new conditional branch,
7455   // whose two destinations will be set later when they are created.
7456   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7457   assert(isa<UnreachableInst>(CurrentTerminator) &&
7458          "Expected to replace unreachable terminator with conditional branch.");
7459   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7460   CondBr->setSuccessor(0, nullptr);
7461   ReplaceInstWithInst(CurrentTerminator, CondBr);
7462 }
7463 
7464 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7465   assert(State.Instance && "Predicated instruction PHI works per instance.");
7466   Instruction *ScalarPredInst = cast<Instruction>(
7467       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7468   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7469   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7470   assert(PredicatingBB && "Predicated block has no single predecessor.");
7471 
7472   // By current pack/unpack logic we need to generate only a single phi node: if
7473   // a vector value for the predicated instruction exists at this point it means
7474   // the instruction has vector users only, and a phi for the vector value is
7475   // needed. In this case the recipe of the predicated instruction is marked to
7476   // also do that packing, thereby "hoisting" the insert-element sequence.
7477   // Otherwise, a phi node for the scalar value is needed.
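  // For example, when the vector value needs merging, the code below produces
  // IR along the lines of (names illustrative):
  //   %merged = phi <VF x Ty> [ %vec.before.insert, %PredicatingBB ],
  //                           [ %vec.with.insert, %PredicatedBB ]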
7478   unsigned Part = State.Instance->Part;
7479   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7480     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7481     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7482     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7483     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7484     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7485     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7486   } else {
7487     Type *PredInstType = PredInst->getType();
7488     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7489     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7490     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7491     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7492   }
7493 }
7494 
7495 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7496   VPValue *Mask = getMask();
7497   if (!Mask)
7498     return State.ILV->vectorizeMemoryInstruction(&Instr);
7499 
7500   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7501   for (unsigned Part = 0; Part < State.UF; ++Part)
7502     MaskValues[Part] = State.get(Mask, Part);
7503   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7504 }
7505 
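// Determine how the scalar epilogue of a vectorized loop should be lowered:
// allowed (the default), not allowed because the function is optimized for
// size, or not needed because tail folding via predication is preferred or
// explicitly requested.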
7506 static ScalarEpilogueLowering
7507 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7508                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
7509                           TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7510                           AssumptionCache *AC, LoopInfo *LI,
7511                           ScalarEvolution *SE, DominatorTree *DT,
7512                           const LoopAccessInfo *LAI) {
7513   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7514   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7515                               !PreferPredicateOverEpilog;
7516 
7517   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7518       (F->hasOptSize() ||
7519        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7520                                    PGSOQueryType::IRPass)))
7521     SEL = CM_ScalarEpilogueNotAllowedOptSize;
7522   else if (PreferPredicateOverEpilog ||
7523            Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7524            (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) &&
7525             Hints.getPredicate() != LoopVectorizeHints::FK_Disabled &&
7526             !PredicateOptDisabled))
7527     SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7528 
7529   return SEL;
7530 }
7531 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
7536 static bool processLoopInVPlanNativePath(
7537     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7538     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7539     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7540     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7541     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7542 
7543   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7544   Function *F = L->getHeader()->getParent();
7545   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7546 
7547   ScalarEpilogueLowering SEL =
7548     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
7549                               PSE.getSE(), DT, LVL->getLAI());
7550 
7551   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7552                                 &Hints, IAI);
7553   // Use the planner for outer loop vectorization.
7554   // TODO: CM is not used at this point inside the planner. Turn CM into an
7555   // optional argument if we don't need it in the future.
7556   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7557 
7558   // Get user vectorization factor.
7559   const unsigned UserVF = Hints.getWidth();
7560 
7561   // Plan how to best vectorize, return the best VF and its cost.
7562   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7563 
7564   // If we are stress testing VPlan builds, do not attempt to generate vector
7565   // code. Masked vector code generation support will follow soon.
7566   // Also, do not attempt to vectorize if no vector code will be produced.
7567   if (VPlanBuildStressTest || EnableVPlanPredication ||
7568       VectorizationFactor::Disabled() == VF)
7569     return false;
7570 
7571   LVP.setBestPlan(VF.Width, 1);
7572 
7573   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7574                          &CM);
7575   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7576                     << L->getHeader()->getParent()->getName() << "\"\n");
7577   LVP.executePlan(LB, DT);
7578 
7579   // Mark the loop as already vectorized to avoid vectorizing again.
7580   Hints.setAlreadyVectorized();
7581 
7582   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7583   return true;
7584 }
7585 
7586 bool LoopVectorizePass::processLoop(Loop *L) {
7587   assert((EnableVPlanNativePath || L->empty()) &&
7588          "VPlan-native path is not enabled. Only process inner loops.");
7589 
7590 #ifndef NDEBUG
7591   const std::string DebugLocStr = getDebugLocString(L);
7592 #endif /* NDEBUG */
7593 
7594   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7595                     << L->getHeader()->getParent()->getName() << "\" from "
7596                     << DebugLocStr << "\n");
7597 
7598   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7599 
7600   LLVM_DEBUG(
7601       dbgs() << "LV: Loop hints:"
7602              << " force="
7603              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7604                      ? "disabled"
7605                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7606                             ? "enabled"
7607                             : "?"))
7608              << " width=" << Hints.getWidth()
7609              << " unroll=" << Hints.getInterleave() << "\n");
7610 
7611   // Function containing loop
7612   Function *F = L->getHeader()->getParent();
7613 
7614   // Looking at the diagnostic output is the only way to determine if a loop
7615   // was vectorized (other than looking at the IR or machine code), so it
7616   // is important to generate an optimization remark for each loop. Most of
7617   // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
7621 
7622   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7623     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7624     return false;
7625   }
7626 
7627   PredicatedScalarEvolution PSE(*SE, *L);
7628 
7629   // Check if it is legal to vectorize the loop.
7630   LoopVectorizationRequirements Requirements(*ORE);
7631   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7632                                 &Requirements, &Hints, DB, AC);
7633   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7634     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7635     Hints.emitRemarkWithHints();
7636     return false;
7637   }
7638 
7639   // Check the function attributes and profiles to find out if this function
7640   // should be optimized for size.
7641   ScalarEpilogueLowering SEL =
7642     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
7643                               PSE.getSE(), DT, LVL.getLAI());
7644 
7645   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7646   // here. They may require CFG and instruction level transformations before
7647   // even evaluating whether vectorization is profitable. Since we cannot modify
7648   // the incoming IR, we need to build VPlan upfront in the vectorization
7649   // pipeline.
7650   if (!L->empty())
7651     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7652                                         ORE, BFI, PSI, Hints);
7653 
7654   assert(L->empty() && "Inner loop expected.");
7655 
7656   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7657   // count by optimizing for size, to minimize overheads.
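  // For example, a loop with a known or profile-estimated trip count below
  // TinyTripCountVectorThreshold is only allowed a scalar epilogue when
  // vectorization was explicitly forced.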
7658   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7659   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7660     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7661                       << "This loop is worth vectorizing only if no scalar "
7662                       << "iteration overheads are incurred.");
7663     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7664       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7665     else {
7666       LLVM_DEBUG(dbgs() << "\n");
7667       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7668     }
7669   }
7670 
7671   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem like it can be correct -- what if the loop
  // is an integer loop and the vector instructions selected are purely integer
  // vector instructions?
7675   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7676     reportVectorizationFailure(
7677         "Can't vectorize when the NoImplicitFloat attribute is used",
7678         "loop not vectorized due to NoImplicitFloat attribute",
7679         "NoImplicitFloat", ORE, L);
7680     Hints.emitRemarkWithHints();
7681     return false;
7682   }
7683 
7684   // Check if the target supports potentially unsafe FP vectorization.
7685   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7686   // for the target we're vectorizing for, to make sure none of the
7687   // additional fp-math flags can help.
7688   if (Hints.isPotentiallyUnsafe() &&
7689       TTI->isFPVectorizationPotentiallyUnsafe()) {
7690     reportVectorizationFailure(
7691         "Potentially unsafe FP op prevents vectorization",
7692         "loop not vectorized due to unsafe FP support.",
7693         "UnsafeFP", ORE, L);
7694     Hints.emitRemarkWithHints();
7695     return false;
7696   }
7697 
7698   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7699   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7700 
7701   // If an override option has been passed in for interleaved accesses, use it.
7702   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7703     UseInterleaved = EnableInterleavedMemAccesses;
7704 
7705   // Analyze interleaved memory accesses.
7706   if (UseInterleaved) {
7707     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7708   }
7709 
7710   // Use the cost model.
7711   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7712                                 F, &Hints, IAI);
7713   CM.collectValuesToIgnore();
7714 
7715   // Use the planner for vectorization.
7716   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7717 
7718   // Get user vectorization factor.
7719   unsigned UserVF = Hints.getWidth();
7720 
7721   // Plan how to best vectorize, return the best VF and its cost.
7722   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7723 
7724   VectorizationFactor VF = VectorizationFactor::Disabled();
7725   unsigned IC = 1;
7726   unsigned UserIC = Hints.getInterleave();
7727 
7728   if (MaybeVF) {
7729     VF = *MaybeVF;
7730     // Select the interleave count.
7731     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7732   }
7733 
7734   // Identify the diagnostic messages that should be produced.
7735   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7736   bool VectorizeLoop = true, InterleaveLoop = true;
7737   if (Requirements.doesNotMeet(F, L, Hints)) {
7738     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7739                          "requirements.\n");
7740     Hints.emitRemarkWithHints();
7741     return false;
7742   }
7743 
7744   if (VF.Width == 1) {
7745     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7746     VecDiagMsg = std::make_pair(
7747         "VectorizationNotBeneficial",
7748         "the cost-model indicates that vectorization is not beneficial");
7749     VectorizeLoop = false;
7750   }
7751 
7752   if (!MaybeVF && UserIC > 1) {
7753     // Tell the user interleaving was avoided up-front, despite being explicitly
7754     // requested.
7755     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7756                          "interleaving should be avoided up front\n");
7757     IntDiagMsg = std::make_pair(
7758         "InterleavingAvoided",
7759         "Ignoring UserIC, because interleaving was avoided up front");
7760     InterleaveLoop = false;
7761   } else if (IC == 1 && UserIC <= 1) {
7762     // Tell the user interleaving is not beneficial.
7763     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7764     IntDiagMsg = std::make_pair(
7765         "InterleavingNotBeneficial",
7766         "the cost-model indicates that interleaving is not beneficial");
7767     InterleaveLoop = false;
7768     if (UserIC == 1) {
7769       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7770       IntDiagMsg.second +=
7771           " and is explicitly disabled or interleave count is set to 1";
7772     }
7773   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
7775     LLVM_DEBUG(
7776         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7777     IntDiagMsg = std::make_pair(
7778         "InterleavingBeneficialButDisabled",
7779         "the cost-model indicates that interleaving is beneficial "
7780         "but is explicitly disabled or interleave count is set to 1");
7781     InterleaveLoop = false;
7782   }
7783 
7784   // Override IC if user provided an interleave count.
7785   IC = UserIC > 0 ? UserIC : IC;
7786 
7787   // Emit diagnostic messages, if any.
7788   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7789   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7791     ORE->emit([&]() {
7792       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7793                                       L->getStartLoc(), L->getHeader())
7794              << VecDiagMsg.second;
7795     });
7796     ORE->emit([&]() {
7797       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7798                                       L->getStartLoc(), L->getHeader())
7799              << IntDiagMsg.second;
7800     });
7801     return false;
7802   } else if (!VectorizeLoop && InterleaveLoop) {
7803     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7804     ORE->emit([&]() {
7805       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7806                                         L->getStartLoc(), L->getHeader())
7807              << VecDiagMsg.second;
7808     });
7809   } else if (VectorizeLoop && !InterleaveLoop) {
7810     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7811                       << ") in " << DebugLocStr << '\n');
7812     ORE->emit([&]() {
7813       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7814                                         L->getStartLoc(), L->getHeader())
7815              << IntDiagMsg.second;
7816     });
7817   } else if (VectorizeLoop && InterleaveLoop) {
7818     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7819                       << ") in " << DebugLocStr << '\n');
7820     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7821   }
7822 
7823   LVP.setBestPlan(VF.Width, IC);
7824 
7825   using namespace ore;
7826   bool DisableRuntimeUnroll = false;
7827   MDNode *OrigLoopID = L->getLoopID();
7828 
7829   if (!VectorizeLoop) {
7830     assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that vectorizing the loop is not profitable, then
    // interleave it.
7833     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7834                                &CM);
7835     LVP.executePlan(Unroller, DT);
7836 
7837     ORE->emit([&]() {
7838       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7839                                 L->getHeader())
7840              << "interleaved loop (interleaved count: "
7841              << NV("InterleaveCount", IC) << ")";
7842     });
7843   } else {
    // If we decided that vectorizing the loop is both legal and profitable,
    // then do it.
7845     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7846                            &LVL, &CM);
7847     LVP.executePlan(LB, DT);
7848     ++LoopsVectorized;
7849 
    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks on strides and memory accesses. A scalar loop that
    // is rarely executed is not worth unrolling.
7853     if (!LB.areSafetyChecksAdded())
7854       DisableRuntimeUnroll = true;
7855 
7856     // Report the vectorization decision.
7857     ORE->emit([&]() {
7858       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7859                                 L->getHeader())
7860              << "vectorized loop (vectorization width: "
7861              << NV("VectorizationFactor", VF.Width)
7862              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7863     });
7864   }
7865 
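  // Propagate any user-provided follow-up metadata (e.g.
  // "llvm.loop.vectorize.followup_epilogue") to the remainder loop; otherwise
  // fall back to the standard already-vectorized hint and, when applicable,
  // the runtime-unroll-disable marker.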
7866   Optional<MDNode *> RemainderLoopID =
7867       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7868                                       LLVMLoopVectorizeFollowupEpilogue});
7869   if (RemainderLoopID.hasValue()) {
7870     L->setLoopID(RemainderLoopID.getValue());
7871   } else {
7872     if (DisableRuntimeUnroll)
7873       AddRuntimeUnrollDisableMetaData(L);
7874 
7875     // Mark the loop as already vectorized to avoid vectorizing again.
7876     Hints.setAlreadyVectorized();
7877   }
7878 
7879   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7880   return true;
7881 }
7882 
7883 bool LoopVectorizePass::runImpl(
7884     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7885     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7886     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7887     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7888     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7889   SE = &SE_;
7890   LI = &LI_;
7891   TTI = &TTI_;
7892   DT = &DT_;
7893   BFI = &BFI_;
7894   TLI = TLI_;
7895   AA = &AA_;
7896   AC = &AC_;
7897   GetLAA = &GetLAA_;
7898   DB = &DB_;
7899   ORE = &ORE_;
7900   PSI = PSI_;
7901 
7902   // Don't attempt if
7903   // 1. the target claims to have no vector registers, and
7904   // 2. interleaving won't help ILP.
7905   //
7906   // The second condition is necessary because, even if the target has no
7907   // vector registers, loop vectorization may still enable scalar
7908   // interleaving.
7909   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7910       TTI->getMaxInterleaveFactor(1) < 2)
7911     return false;
7912 
7913   bool Changed = false;
7914 
7915   // The vectorizer requires loops to be in simplified form.
7916   // Since simplification may add new inner loops, it has to run before the
7917   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7919   // vectorized.
7920   for (auto &L : *LI)
7921     Changed |=
7922         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7923 
7924   // Build up a worklist of inner-loops to vectorize. This is necessary as
7925   // the act of vectorizing or partially unrolling a loop creates new loops
7926   // and can invalidate iterators across the loops.
7927   SmallVector<Loop *, 8> Worklist;
7928 
7929   for (Loop *L : *LI)
7930     collectSupportedLoops(*L, LI, ORE, Worklist);
7931 
7932   LoopsAnalyzed += Worklist.size();
7933 
7934   // Now walk the identified inner loops.
7935   while (!Worklist.empty()) {
7936     Loop *L = Worklist.pop_back_val();
7937 
7938     // For the inner loops we actually process, form LCSSA to simplify the
7939     // transform.
7940     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7941 
7942     Changed |= processLoop(L);
7943   }
7944 
7945   // Process each loop nest in the function.
7946   return Changed;
7947 }
7948 
7949 PreservedAnalyses LoopVectorizePass::run(Function &F,
7950                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
7992 }
7993