1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/SizeOpts.h"
141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142 #include <algorithm>
143 #include <cassert>
144 #include <cstdint>
145 #include <cstdlib>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
180 // Indicates that an epilogue is undesired, predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185     cl::desc("Indicate that an epilogue is undesired, predication should be "
186              "used instead."));
187 
188 static cl::opt<bool> MaximizeBandwidth(
189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190     cl::desc("Maximize bandwidth when selecting vectorization factor which "
191              "will be determined by the smallest type in loop."));
192 
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
202 
203 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
204     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
205     cl::desc("We don't interleave loops with a estimated constant trip count "
206              "below this number"));
207 
208 static cl::opt<unsigned> ForceTargetNumScalarRegs(
209     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
210     cl::desc("A flag that overrides the target's number of scalar registers."));
211 
212 static cl::opt<unsigned> ForceTargetNumVectorRegs(
213     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
214     cl::desc("A flag that overrides the target's number of vector registers."));
215 
216 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
217     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
218     cl::desc("A flag that overrides the target's max interleave factor for "
219              "scalar loops."));
220 
221 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
222     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
223     cl::desc("A flag that overrides the target's max interleave factor for "
224              "vectorized loops."));
225 
226 static cl::opt<unsigned> ForceTargetInstructionCost(
227     "force-target-instruction-cost", cl::init(0), cl::Hidden,
228     cl::desc("A flag that overrides the target's expected cost for "
229              "an instruction to a single constant value. Mostly "
230              "useful for getting consistent testing."));
231 
232 static cl::opt<unsigned> SmallLoopCost(
233     "small-loop-cost", cl::init(20), cl::Hidden,
234     cl::desc(
235         "The cost of a loop that is considered 'small' by the interleaver."));
236 
237 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
238     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
239     cl::desc("Enable the use of the block frequency analysis to access PGO "
240              "heuristics minimizing code growth in cold regions and being more "
241              "aggressive in hot regions."));
242 
243 // Runtime interleave loops for load/store throughput.
244 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
245     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
246     cl::desc(
247         "Enable runtime interleaving until load/store ports are saturated"));
248 
249 /// The number of stores in a loop that are allowed to need predication.
250 static cl::opt<unsigned> NumberOfStoresToPredicate(
251     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
252     cl::desc("Max number of stores to be predicated behind an if."));
253 
254 static cl::opt<bool> EnableIndVarRegisterHeur(
255     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
256     cl::desc("Count the induction variable only once when interleaving"));
257 
258 static cl::opt<bool> EnableCondStoresVectorization(
259     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
260     cl::desc("Enable if predication of stores during vectorization."));
261 
262 static cl::opt<unsigned> MaxNestedScalarReductionIC(
263     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
264     cl::desc("The maximum interleave count to use when interleaving a scalar "
265              "reduction in a nested loop."));
266 
267 cl::opt<bool> EnableVPlanNativePath(
268     "enable-vplan-native-path", cl::init(false), cl::Hidden,
269     cl::desc("Enable VPlan-native vectorization path with "
270              "support for outer loop vectorization."));
271 
272 // FIXME: Remove this switch once we have divergence analysis. Currently we
273 // assume divergent non-backedge branches when this switch is true.
274 cl::opt<bool> EnableVPlanPredication(
275     "enable-vplan-predication", cl::init(false), cl::Hidden,
276     cl::desc("Enable VPlan-native vectorization path predicator with "
277              "support for outer loop vectorization."));
278 
279 // This flag enables the stress testing of the VPlan H-CFG construction in the
280 // VPlan-native vectorization path. It must be used in conjuction with
281 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
282 // verification of the H-CFGs built.
283 static cl::opt<bool> VPlanBuildStressTest(
284     "vplan-build-stress-test", cl::init(false), cl::Hidden,
285     cl::desc(
286         "Build VPlan for every supported loop nest in the function and bail "
287         "out right after the build (stress test the VPlan H-CFG construction "
288         "in the VPlan-native vectorization path)."));
289 
290 cl::opt<bool> llvm::EnableLoopInterleaving(
291     "interleave-loops", cl::init(true), cl::Hidden,
292     cl::desc("Enable loop interleaving in Loop vectorization passes"));
293 cl::opt<bool> llvm::EnableLoopVectorization(
294     "vectorize-loops", cl::init(true), cl::Hidden,
295     cl::desc("Run the Loop vectorization passes"));
296 
297 /// A helper function for converting Scalar types to vector types.
298 /// If the incoming type is void, we return void. If the VF is 1, we return
299 /// the scalar type.
300 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
301   if (Scalar->isVoidTy() || VF == 1)
302     return Scalar;
303   return VectorType::get(Scalar, VF);
304 }
305 
306 /// A helper function that returns the type of loaded or stored value.
307 static Type *getMemInstValueType(Value *I) {
308   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
309          "Expected Load or Store instruction");
310   if (auto *LI = dyn_cast<LoadInst>(I))
311     return LI->getType();
312   return cast<StoreInst>(I)->getValueOperand()->getType();
313 }
314 
315 /// A helper function that returns true if the given type is irregular. The
316 /// type is irregular if its allocated size doesn't equal the store size of an
317 /// element of the corresponding vector type at the given vectorization factor.
318 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
319   // Determine if an array of VF elements of type Ty is "bitcast compatible"
320   // with a <VF x Ty> vector.
321   if (VF > 1) {
322     auto *VectorTy = VectorType::get(Ty, VF);
323     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
324   }
325 
326   // If the vectorization factor is one, we just check if an array of type Ty
327   // requires padding between elements.
328   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
329 }
330 
331 /// A helper function that returns the reciprocal of the block probability of
332 /// predicated blocks. If we return X, we are assuming the predicated block
333 /// will execute once for every X iterations of the loop header.
334 ///
335 /// TODO: We should use actual block probability here, if available. Currently,
336 ///       we always assume predicated blocks have a 50% chance of executing.
337 static unsigned getReciprocalPredBlockProb() { return 2; }
338 
339 /// A helper function that adds a 'fast' flag to floating-point operations.
340 static Value *addFastMathFlag(Value *V) {
341   if (isa<FPMathOperator>(V))
342     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
343   return V;
344 }
345 
346 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
347   if (isa<FPMathOperator>(V))
348     cast<Instruction>(V)->setFastMathFlags(FMF);
349   return V;
350 }
351 
352 /// A helper function that returns an integer or floating-point constant with
353 /// value C.
354 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
355   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
356                            : ConstantFP::get(Ty, C);
357 }
358 
359 /// Returns "best known" trip count for the specified loop \p L as defined by
360 /// the following procedure:
361 ///   1) Returns exact trip count if it is known.
362 ///   2) Returns expected trip count according to profile data if any.
363 ///   3) Returns upper bound estimate if it is known.
364 ///   4) Returns None if all of the above failed.
365 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
366   // Check if exact trip count is known.
367   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
368     return ExpectedTC;
369 
370   // Check if there is an expected trip count available from profile data.
371   if (LoopVectorizeWithBlockFrequency)
372     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
373       return EstimatedTC;
374 
375   // Check if upper bound estimate is known.
376   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
377     return ExpectedTC;
378 
379   return None;
380 }
381 
382 namespace llvm {
383 
384 /// InnerLoopVectorizer vectorizes loops which contain only one basic
385 /// block to a specified vectorization factor (VF).
386 /// This class performs the widening of scalars into vectors, or multiple
387 /// scalars. This class also implements the following features:
388 /// * It inserts an epilogue loop for handling loops that don't have iteration
389 ///   counts that are known to be a multiple of the vectorization factor.
390 /// * It handles the code generation for reduction variables.
391 /// * Scalarization (implementation using scalars) of un-vectorizable
392 ///   instructions.
393 /// InnerLoopVectorizer does not perform any vectorization-legality
394 /// checks, and relies on the caller to check for the different legality
395 /// aspects. The InnerLoopVectorizer relies on the
396 /// LoopVectorizationLegality class to provide information about the induction
397 /// and reduction variables that were found to a given vectorization factor.
398 class InnerLoopVectorizer {
399 public:
400   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
401                       LoopInfo *LI, DominatorTree *DT,
402                       const TargetLibraryInfo *TLI,
403                       const TargetTransformInfo *TTI, AssumptionCache *AC,
404                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
405                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
406                       LoopVectorizationCostModel *CM)
407       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
408         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
409         Builder(PSE.getSE()->getContext()),
410         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
411   virtual ~InnerLoopVectorizer() = default;
412 
413   /// Create a new empty loop. Unlink the old loop and connect the new one.
414   /// Return the pre-header block of the new loop.
415   BasicBlock *createVectorizedLoopSkeleton();
416 
417   /// Widen a single instruction within the innermost loop.
418   void widenInstruction(Instruction &I);
419 
420   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
421   void fixVectorizedLoop();
422 
423   // Return true if any runtime check is added.
424   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
425 
426   /// A type for vectorized values in the new loop. Each value from the
427   /// original loop, when vectorized, is represented by UF vector values in the
428   /// new unrolled loop, where UF is the unroll factor.
429   using VectorParts = SmallVector<Value *, 2>;
430 
431   /// Vectorize a single PHINode in a block. This method handles the induction
432   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
433   /// arbitrary length vectors.
434   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
435 
436   /// A helper function to scalarize a single Instruction in the innermost loop.
437   /// Generates a sequence of scalar instances for each lane between \p MinLane
438   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
439   /// inclusive..
440   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
441                             bool IfPredicateInstr);
442 
443   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
444   /// is provided, the integer induction variable will first be truncated to
445   /// the corresponding type.
446   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
447 
448   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
449   /// vector or scalar value on-demand if one is not yet available. When
450   /// vectorizing a loop, we visit the definition of an instruction before its
451   /// uses. When visiting the definition, we either vectorize or scalarize the
452   /// instruction, creating an entry for it in the corresponding map. (In some
453   /// cases, such as induction variables, we will create both vector and scalar
454   /// entries.) Then, as we encounter uses of the definition, we derive values
455   /// for each scalar or vector use unless such a value is already available.
456   /// For example, if we scalarize a definition and one of its uses is vector,
457   /// we build the required vector on-demand with an insertelement sequence
458   /// when visiting the use. Otherwise, if the use is scalar, we can use the
459   /// existing scalar definition.
460   ///
461   /// Return a value in the new loop corresponding to \p V from the original
462   /// loop at unroll index \p Part. If the value has already been vectorized,
463   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
464   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
465   /// a new vector value on-demand by inserting the scalar values into a vector
466   /// with an insertelement sequence. If the value has been neither vectorized
467   /// nor scalarized, it must be loop invariant, so we simply broadcast the
468   /// value into a vector.
469   Value *getOrCreateVectorValue(Value *V, unsigned Part);
470 
471   /// Return a value in the new loop corresponding to \p V from the original
472   /// loop at unroll and vector indices \p Instance. If the value has been
473   /// vectorized but not scalarized, the necessary extractelement instruction
474   /// will be generated.
475   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
476 
477   /// Construct the vector value of a scalarized value \p V one lane at a time.
478   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
479 
480   /// Try to vectorize the interleaved access group that \p Instr belongs to,
481   /// optionally masking the vector operations if \p BlockInMask is non-null.
482   void vectorizeInterleaveGroup(Instruction *Instr,
483                                 VectorParts *BlockInMask = nullptr);
484 
485   /// Vectorize Load and Store instructions, optionally masking the vector
486   /// operations if \p BlockInMask is non-null.
487   void vectorizeMemoryInstruction(Instruction *Instr,
488                                   VectorParts *BlockInMask = nullptr);
489 
490   /// Set the debug location in the builder using the debug location in
491   /// the instruction.
492   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
493 
494   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
495   void fixNonInductionPHIs(void);
496 
497 protected:
498   friend class LoopVectorizationPlanner;
499 
500   /// A small list of PHINodes.
501   using PhiVector = SmallVector<PHINode *, 4>;
502 
503   /// A type for scalarized values in the new loop. Each value from the
504   /// original loop, when scalarized, is represented by UF x VF scalar values
505   /// in the new unrolled loop, where UF is the unroll factor and VF is the
506   /// vectorization factor.
507   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
508 
509   /// Set up the values of the IVs correctly when exiting the vector loop.
510   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
511                     Value *CountRoundDown, Value *EndValue,
512                     BasicBlock *MiddleBlock);
513 
514   /// Create a new induction variable inside L.
515   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
516                                    Value *Step, Instruction *DL);
517 
518   /// Handle all cross-iteration phis in the header.
519   void fixCrossIterationPHIs();
520 
521   /// Fix a first-order recurrence. This is the second phase of vectorizing
522   /// this phi node.
523   void fixFirstOrderRecurrence(PHINode *Phi);
524 
525   /// Fix a reduction cross-iteration phi. This is the second phase of
526   /// vectorizing this phi node.
527   void fixReduction(PHINode *Phi);
528 
529   /// The Loop exit block may have single value PHI nodes with some
530   /// incoming value. While vectorizing we only handled real values
531   /// that were defined inside the loop and we should have one value for
532   /// each predecessor of its parent basic block. See PR14725.
533   void fixLCSSAPHIs();
534 
535   /// Iteratively sink the scalarized operands of a predicated instruction into
536   /// the block that was created for it.
537   void sinkScalarOperands(Instruction *PredInst);
538 
539   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
540   /// represented as.
541   void truncateToMinimalBitwidths();
542 
543   /// Insert the new loop to the loop hierarchy and pass manager
544   /// and update the analysis passes.
545   void updateAnalysis();
546 
547   /// Create a broadcast instruction. This method generates a broadcast
548   /// instruction (shuffle) for loop invariant values and for the induction
549   /// value. If this is the induction variable then we extend it to N, N+1, ...
550   /// this is needed because each iteration in the loop corresponds to a SIMD
551   /// element.
552   virtual Value *getBroadcastInstrs(Value *V);
553 
554   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
555   /// to each vector element of Val. The sequence starts at StartIndex.
556   /// \p Opcode is relevant for FP induction variable.
557   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
558                                Instruction::BinaryOps Opcode =
559                                Instruction::BinaryOpsEnd);
560 
561   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
562   /// variable on which to base the steps, \p Step is the size of the step, and
563   /// \p EntryVal is the value from the original loop that maps to the steps.
564   /// Note that \p EntryVal doesn't have to be an induction variable - it
565   /// can also be a truncate instruction.
566   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
567                         const InductionDescriptor &ID);
568 
569   /// Create a vector induction phi node based on an existing scalar one. \p
570   /// EntryVal is the value from the original loop that maps to the vector phi
571   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
572   /// truncate instruction, instead of widening the original IV, we widen a
573   /// version of the IV truncated to \p EntryVal's type.
574   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
575                                        Value *Step, Instruction *EntryVal);
576 
577   /// Returns true if an instruction \p I should be scalarized instead of
578   /// vectorized for the chosen vectorization factor.
579   bool shouldScalarizeInstruction(Instruction *I) const;
580 
581   /// Returns true if we should generate a scalar version of \p IV.
582   bool needsScalarInduction(Instruction *IV) const;
583 
584   /// If there is a cast involved in the induction variable \p ID, which should
585   /// be ignored in the vectorized loop body, this function records the
586   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
587   /// cast. We had already proved that the casted Phi is equal to the uncasted
588   /// Phi in the vectorized loop (under a runtime guard), and therefore
589   /// there is no need to vectorize the cast - the same value can be used in the
590   /// vector loop for both the Phi and the cast.
591   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
592   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
593   ///
594   /// \p EntryVal is the value from the original loop that maps to the vector
595   /// phi node and is used to distinguish what is the IV currently being
596   /// processed - original one (if \p EntryVal is a phi corresponding to the
597   /// original IV) or the "newly-created" one based on the proof mentioned above
598   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
599   /// latter case \p EntryVal is a TruncInst and we must not record anything for
600   /// that IV, but it's error-prone to expect callers of this routine to care
601   /// about that, hence this explicit parameter.
602   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
603                                              const Instruction *EntryVal,
604                                              Value *VectorLoopValue,
605                                              unsigned Part,
606                                              unsigned Lane = UINT_MAX);
607 
608   /// Generate a shuffle sequence that will reverse the vector Vec.
609   virtual Value *reverseVector(Value *Vec);
610 
611   /// Returns (and creates if needed) the original loop trip count.
612   Value *getOrCreateTripCount(Loop *NewLoop);
613 
614   /// Returns (and creates if needed) the trip count of the widened loop.
615   Value *getOrCreateVectorTripCount(Loop *NewLoop);
616 
617   /// Returns a bitcasted value to the requested vector type.
618   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
619   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
620                                 const DataLayout &DL);
621 
622   /// Emit a bypass check to see if the vector trip count is zero, including if
623   /// it overflows.
624   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
625 
626   /// Emit a bypass check to see if all of the SCEV assumptions we've
627   /// had to make are correct.
628   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
629 
630   /// Emit bypass checks to check any memory assumptions we may have made.
631   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
632 
633   /// Compute the transformed value of Index at offset StartValue using step
634   /// StepValue.
635   /// For integer induction, returns StartValue + Index * StepValue.
636   /// For pointer induction, returns StartValue[Index * StepValue].
637   /// FIXME: The newly created binary instructions should contain nsw/nuw
638   /// flags, which can be found from the original scalar operations.
639   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
640                               const DataLayout &DL,
641                               const InductionDescriptor &ID) const;
642 
643   /// Add additional metadata to \p To that was not present on \p Orig.
644   ///
645   /// Currently this is used to add the noalias annotations based on the
646   /// inserted memchecks.  Use this for instructions that are *cloned* into the
647   /// vector loop.
648   void addNewMetadata(Instruction *To, const Instruction *Orig);
649 
650   /// Add metadata from one instruction to another.
651   ///
652   /// This includes both the original MDs from \p From and additional ones (\see
653   /// addNewMetadata).  Use this for *newly created* instructions in the vector
654   /// loop.
655   void addMetadata(Instruction *To, Instruction *From);
656 
657   /// Similar to the previous function but it adds the metadata to a
658   /// vector of instructions.
659   void addMetadata(ArrayRef<Value *> To, Instruction *From);
660 
661   /// The original loop.
662   Loop *OrigLoop;
663 
664   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
665   /// dynamic knowledge to simplify SCEV expressions and converts them to a
666   /// more usable form.
667   PredicatedScalarEvolution &PSE;
668 
669   /// Loop Info.
670   LoopInfo *LI;
671 
672   /// Dominator Tree.
673   DominatorTree *DT;
674 
675   /// Alias Analysis.
676   AliasAnalysis *AA;
677 
678   /// Target Library Info.
679   const TargetLibraryInfo *TLI;
680 
681   /// Target Transform Info.
682   const TargetTransformInfo *TTI;
683 
684   /// Assumption Cache.
685   AssumptionCache *AC;
686 
687   /// Interface to emit optimization remarks.
688   OptimizationRemarkEmitter *ORE;
689 
690   /// LoopVersioning.  It's only set up (non-null) if memchecks were
691   /// used.
692   ///
693   /// This is currently only used to add no-alias metadata based on the
694   /// memchecks.  The actually versioning is performed manually.
695   std::unique_ptr<LoopVersioning> LVer;
696 
697   /// The vectorization SIMD factor to use. Each vector will have this many
698   /// vector elements.
699   unsigned VF;
700 
701   /// The vectorization unroll factor to use. Each scalar is vectorized to this
702   /// many different vector instructions.
703   unsigned UF;
704 
705   /// The builder that we use
706   IRBuilder<> Builder;
707 
708   // --- Vectorization state ---
709 
710   /// The vector-loop preheader.
711   BasicBlock *LoopVectorPreHeader;
712 
713   /// The scalar-loop preheader.
714   BasicBlock *LoopScalarPreHeader;
715 
716   /// Middle Block between the vector and the scalar.
717   BasicBlock *LoopMiddleBlock;
718 
719   /// The ExitBlock of the scalar loop.
720   BasicBlock *LoopExitBlock;
721 
722   /// The vector loop body.
723   BasicBlock *LoopVectorBody;
724 
725   /// The scalar loop body.
726   BasicBlock *LoopScalarBody;
727 
728   /// A list of all bypass blocks. The first block is the entry of the loop.
729   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
730 
731   /// The new Induction variable which was added to the new block.
732   PHINode *Induction = nullptr;
733 
734   /// The induction variable of the old basic block.
735   PHINode *OldInduction = nullptr;
736 
737   /// Maps values from the original loop to their corresponding values in the
738   /// vectorized loop. A key value can map to either vector values, scalar
739   /// values or both kinds of values, depending on whether the key was
740   /// vectorized and scalarized.
741   VectorizerValueMap VectorLoopValueMap;
742 
743   /// Store instructions that were predicated.
744   SmallVector<Instruction *, 4> PredicatedInstructions;
745 
746   /// Trip count of the original loop.
747   Value *TripCount = nullptr;
748 
749   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
750   Value *VectorTripCount = nullptr;
751 
752   /// The legality analysis.
753   LoopVectorizationLegality *Legal;
754 
755   /// The profitablity analysis.
756   LoopVectorizationCostModel *Cost;
757 
758   // Record whether runtime checks are added.
759   bool AddedSafetyChecks = false;
760 
761   // Holds the end values for each induction variable. We save the end values
762   // so we can later fix-up the external users of the induction variables.
763   DenseMap<PHINode *, Value *> IVEndValues;
764 
765   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
766   // fixed up at the end of vector code generation.
767   SmallVector<PHINode *, 8> OrigPHIsToFix;
768 };
769 
770 class InnerLoopUnroller : public InnerLoopVectorizer {
771 public:
772   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
773                     LoopInfo *LI, DominatorTree *DT,
774                     const TargetLibraryInfo *TLI,
775                     const TargetTransformInfo *TTI, AssumptionCache *AC,
776                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
777                     LoopVectorizationLegality *LVL,
778                     LoopVectorizationCostModel *CM)
779       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
780                             UnrollFactor, LVL, CM) {}
781 
782 private:
783   Value *getBroadcastInstrs(Value *V) override;
784   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
785                        Instruction::BinaryOps Opcode =
786                        Instruction::BinaryOpsEnd) override;
787   Value *reverseVector(Value *Vec) override;
788 };
789 
790 } // end namespace llvm
791 
792 /// Look for a meaningful debug location on the instruction or it's
793 /// operands.
794 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
795   if (!I)
796     return I;
797 
798   DebugLoc Empty;
799   if (I->getDebugLoc() != Empty)
800     return I;
801 
802   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
803     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
804       if (OpInst->getDebugLoc() != Empty)
805         return OpInst;
806   }
807 
808   return I;
809 }
810 
811 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
812   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
813     const DILocation *DIL = Inst->getDebugLoc();
814     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
815         !isa<DbgInfoIntrinsic>(Inst)) {
816       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
817       if (NewDIL)
818         B.SetCurrentDebugLocation(NewDIL.getValue());
819       else
820         LLVM_DEBUG(dbgs()
821                    << "Failed to create new discriminator: "
822                    << DIL->getFilename() << " Line: " << DIL->getLine());
823     }
824     else
825       B.SetCurrentDebugLocation(DIL);
826   } else
827     B.SetCurrentDebugLocation(DebugLoc());
828 }
829 
830 /// Write a record \p DebugMsg about vectorization failure to the debug
831 /// output stream. If \p I is passed, it is an instruction that prevents
832 /// vectorization.
833 #ifndef NDEBUG
834 static void debugVectorizationFailure(const StringRef DebugMsg,
835     Instruction *I) {
836   dbgs() << "LV: Not vectorizing: " << DebugMsg;
837   if (I != nullptr)
838     dbgs() << " " << *I;
839   else
840     dbgs() << '.';
841   dbgs() << '\n';
842 }
843 #endif
844 
845 /// Create an analysis remark that explains why vectorization failed
846 ///
847 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
848 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
849 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
850 /// the location of the remark.  \return the remark object that can be
851 /// streamed to.
852 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
853     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
854   Value *CodeRegion = TheLoop->getHeader();
855   DebugLoc DL = TheLoop->getStartLoc();
856 
857   if (I) {
858     CodeRegion = I->getParent();
859     // If there is no debug location attached to the instruction, revert back to
860     // using the loop's.
861     if (I->getDebugLoc())
862       DL = I->getDebugLoc();
863   }
864 
865   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
866   R << "loop not vectorized: ";
867   return R;
868 }
869 
870 namespace llvm {
871 
872 void reportVectorizationFailure(const StringRef DebugMsg,
873     const StringRef OREMsg, const StringRef ORETag,
874     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
875   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
876   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
877   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
878                 ORETag, TheLoop, I) << OREMsg);
879 }
880 
881 } // end namespace llvm
882 
883 #ifndef NDEBUG
884 /// \return string containing a file name and a line # for the given loop.
885 static std::string getDebugLocString(const Loop *L) {
886   std::string Result;
887   if (L) {
888     raw_string_ostream OS(Result);
889     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
890       LoopDbgLoc.print(OS);
891     else
892       // Just print the module name.
893       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
894     OS.flush();
895   }
896   return Result;
897 }
898 #endif
899 
900 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
901                                          const Instruction *Orig) {
902   // If the loop was versioned with memchecks, add the corresponding no-alias
903   // metadata.
904   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
905     LVer->annotateInstWithNoAlias(To, Orig);
906 }
907 
908 void InnerLoopVectorizer::addMetadata(Instruction *To,
909                                       Instruction *From) {
910   propagateMetadata(To, From);
911   addNewMetadata(To, From);
912 }
913 
914 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
915                                       Instruction *From) {
916   for (Value *V : To) {
917     if (Instruction *I = dyn_cast<Instruction>(V))
918       addMetadata(I, From);
919   }
920 }
921 
922 namespace llvm {
923 
924 // Loop vectorization cost-model hints how the scalar epilogue loop should be
925 // lowered.
926 enum ScalarEpilogueLowering {
927 
928   // The default: allowing scalar epilogues.
929   CM_ScalarEpilogueAllowed,
930 
931   // Vectorization with OptForSize: don't allow epilogues.
932   CM_ScalarEpilogueNotAllowedOptSize,
933 
934   // A special case of vectorisation with OptForSize: loops with a very small
935   // trip count are considered for vectorization under OptForSize, thereby
936   // making sure the cost of their loop body is dominant, free of runtime
937   // guards and scalar iteration overheads.
938   CM_ScalarEpilogueNotAllowedLowTripLoop,
939 
940   // Loop hint predicate indicating an epilogue is undesired.
941   CM_ScalarEpilogueNotNeededUsePredicate
942 };
943 
944 /// LoopVectorizationCostModel - estimates the expected speedups due to
945 /// vectorization.
946 /// In many cases vectorization is not profitable. This can happen because of
947 /// a number of reasons. In this class we mainly attempt to predict the
948 /// expected speedup/slowdowns due to the supported instruction set. We use the
949 /// TargetTransformInfo to query the different backends for the cost of
950 /// different operations.
951 class LoopVectorizationCostModel {
952 public:
953   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
954                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
955                              LoopVectorizationLegality *Legal,
956                              const TargetTransformInfo &TTI,
957                              const TargetLibraryInfo *TLI, DemandedBits *DB,
958                              AssumptionCache *AC,
959                              OptimizationRemarkEmitter *ORE, const Function *F,
960                              const LoopVectorizeHints *Hints,
961                              InterleavedAccessInfo &IAI)
962       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
963         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
964         Hints(Hints), InterleaveInfo(IAI) {}
965 
966   /// \return An upper bound for the vectorization factor, or None if
967   /// vectorization and interleaving should be avoided up front.
968   Optional<unsigned> computeMaxVF();
969 
970   /// \return True if runtime checks are required for vectorization, and false
971   /// otherwise.
972   bool runtimeChecksRequired();
973 
974   /// \return The most profitable vectorization factor and the cost of that VF.
975   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
976   /// then this vectorization factor will be selected if vectorization is
977   /// possible.
978   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
979 
980   /// Setup cost-based decisions for user vectorization factor.
981   void selectUserVectorizationFactor(unsigned UserVF) {
982     collectUniformsAndScalars(UserVF);
983     collectInstsToScalarize(UserVF);
984   }
985 
986   /// \return The size (in bits) of the smallest and widest types in the code
987   /// that needs to be vectorized. We ignore values that remain scalar such as
988   /// 64 bit loop indices.
989   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
990 
991   /// \return The desired interleave count.
992   /// If interleave count has been specified by metadata it will be returned.
993   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
994   /// are the selected vectorization factor and the cost of the selected VF.
995   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
996 
997   /// Memory access instruction may be vectorized in more than one way.
998   /// Form of instruction after vectorization depends on cost.
999   /// This function takes cost-based decisions for Load/Store instructions
1000   /// and collects them in a map. This decisions map is used for building
1001   /// the lists of loop-uniform and loop-scalar instructions.
1002   /// The calculated cost is saved with widening decision in order to
1003   /// avoid redundant calculations.
1004   void setCostBasedWideningDecision(unsigned VF);
1005 
1006   /// A struct that represents some properties of the register usage
1007   /// of a loop.
1008   struct RegisterUsage {
1009     /// Holds the number of loop invariant values that are used in the loop.
1010     /// The key is ClassID of target-provided register class.
1011     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1012     /// Holds the maximum number of concurrent live intervals in the loop.
1013     /// The key is ClassID of target-provided register class.
1014     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1015   };
1016 
1017   /// \return Returns information about the register usages of the loop for the
1018   /// given vectorization factors.
1019   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1020 
1021   /// Collect values we want to ignore in the cost model.
1022   void collectValuesToIgnore();
1023 
1024   /// \returns The smallest bitwidth each instruction can be represented with.
1025   /// The vector equivalents of these instructions should be truncated to this
1026   /// type.
1027   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1028     return MinBWs;
1029   }
1030 
1031   /// \returns True if it is more profitable to scalarize instruction \p I for
1032   /// vectorization factor \p VF.
1033   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1034     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1035 
1036     // Cost model is not run in the VPlan-native path - return conservative
1037     // result until this changes.
1038     if (EnableVPlanNativePath)
1039       return false;
1040 
1041     auto Scalars = InstsToScalarize.find(VF);
1042     assert(Scalars != InstsToScalarize.end() &&
1043            "VF not yet analyzed for scalarization profitability");
1044     return Scalars->second.find(I) != Scalars->second.end();
1045   }
1046 
1047   /// Returns true if \p I is known to be uniform after vectorization.
1048   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1049     if (VF == 1)
1050       return true;
1051 
1052     // Cost model is not run in the VPlan-native path - return conservative
1053     // result until this changes.
1054     if (EnableVPlanNativePath)
1055       return false;
1056 
1057     auto UniformsPerVF = Uniforms.find(VF);
1058     assert(UniformsPerVF != Uniforms.end() &&
1059            "VF not yet analyzed for uniformity");
1060     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1061   }
1062 
1063   /// Returns true if \p I is known to be scalar after vectorization.
1064   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1065     if (VF == 1)
1066       return true;
1067 
1068     // Cost model is not run in the VPlan-native path - return conservative
1069     // result until this changes.
1070     if (EnableVPlanNativePath)
1071       return false;
1072 
1073     auto ScalarsPerVF = Scalars.find(VF);
1074     assert(ScalarsPerVF != Scalars.end() &&
1075            "Scalar values are not calculated for VF");
1076     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1077   }
1078 
1079   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1080   /// for vectorization factor \p VF.
1081   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1082     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1083            !isProfitableToScalarize(I, VF) &&
1084            !isScalarAfterVectorization(I, VF);
1085   }
1086 
1087   /// Decision that was taken during cost calculation for memory instruction.
1088   enum InstWidening {
1089     CM_Unknown,
1090     CM_Widen,         // For consecutive accesses with stride +1.
1091     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1092     CM_Interleave,
1093     CM_GatherScatter,
1094     CM_Scalarize
1095   };
1096 
1097   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1098   /// instruction \p I and vector width \p VF.
1099   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1100                            unsigned Cost) {
1101     assert(VF >= 2 && "Expected VF >=2");
1102     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1103   }
1104 
1105   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1106   /// interleaving group \p Grp and vector width \p VF.
1107   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1108                            InstWidening W, unsigned Cost) {
1109     assert(VF >= 2 && "Expected VF >=2");
1110     /// Broadcast this decicion to all instructions inside the group.
1111     /// But the cost will be assigned to one instruction only.
1112     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1113       if (auto *I = Grp->getMember(i)) {
1114         if (Grp->getInsertPos() == I)
1115           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1116         else
1117           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1118       }
1119     }
1120   }
1121 
1122   /// Return the cost model decision for the given instruction \p I and vector
1123   /// width \p VF. Return CM_Unknown if this instruction did not pass
1124   /// through the cost modeling.
1125   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1126     assert(VF >= 2 && "Expected VF >=2");
1127 
1128     // Cost model is not run in the VPlan-native path - return conservative
1129     // result until this changes.
1130     if (EnableVPlanNativePath)
1131       return CM_GatherScatter;
1132 
1133     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1134     auto Itr = WideningDecisions.find(InstOnVF);
1135     if (Itr == WideningDecisions.end())
1136       return CM_Unknown;
1137     return Itr->second.first;
1138   }
1139 
1140   /// Return the vectorization cost for the given instruction \p I and vector
1141   /// width \p VF.
1142   unsigned getWideningCost(Instruction *I, unsigned VF) {
1143     assert(VF >= 2 && "Expected VF >=2");
1144     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1145     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1146            "The cost is not calculated");
1147     return WideningDecisions[InstOnVF].second;
1148   }
1149 
1150   /// Return True if instruction \p I is an optimizable truncate whose operand
1151   /// is an induction variable. Such a truncate will be removed by adding a new
1152   /// induction variable with the destination type.
1153   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1154     // If the instruction is not a truncate, return false.
1155     auto *Trunc = dyn_cast<TruncInst>(I);
1156     if (!Trunc)
1157       return false;
1158 
1159     // Get the source and destination types of the truncate.
1160     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1161     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1162 
1163     // If the truncate is free for the given types, return false. Replacing a
1164     // free truncate with an induction variable would add an induction variable
1165     // update instruction to each iteration of the loop. We exclude from this
1166     // check the primary induction variable since it will need an update
1167     // instruction regardless.
1168     Value *Op = Trunc->getOperand(0);
1169     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1170       return false;
1171 
1172     // If the truncated value is not an induction variable, return false.
1173     return Legal->isInductionPhi(Op);
1174   }
1175 
1176   /// Collects the instructions to scalarize for each predicated instruction in
1177   /// the loop.
1178   void collectInstsToScalarize(unsigned VF);
1179 
1180   /// Collect Uniform and Scalar values for the given \p VF.
1181   /// The sets depend on CM decision for Load/Store instructions
1182   /// that may be vectorized as interleave, gather-scatter or scalarized.
1183   void collectUniformsAndScalars(unsigned VF) {
1184     // Do the analysis once.
1185     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1186       return;
1187     setCostBasedWideningDecision(VF);
1188     collectLoopUniforms(VF);
1189     collectLoopScalars(VF);
1190   }
1191 
1192   /// Returns true if the target machine supports masked store operation
1193   /// for the given \p DataType and kind of access to \p Ptr.
1194   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1195     return Legal->isConsecutivePtr(Ptr) &&
1196            TTI.isLegalMaskedStore(DataType, Alignment);
1197   }
1198 
1199   /// Returns true if the target machine supports masked load operation
1200   /// for the given \p DataType and kind of access to \p Ptr.
1201   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1202     return Legal->isConsecutivePtr(Ptr) &&
1203            TTI.isLegalMaskedLoad(DataType, Alignment);
1204   }
1205 
1206   /// Returns true if the target machine supports masked scatter operation
1207   /// for the given \p DataType.
1208   bool isLegalMaskedScatter(Type *DataType) {
1209     return TTI.isLegalMaskedScatter(DataType);
1210   }
1211 
1212   /// Returns true if the target machine supports masked gather operation
1213   /// for the given \p DataType.
1214   bool isLegalMaskedGather(Type *DataType) {
1215     return TTI.isLegalMaskedGather(DataType);
1216   }
1217 
1218   /// Returns true if the target machine can represent \p V as a masked gather
1219   /// or scatter operation.
1220   bool isLegalGatherOrScatter(Value *V) {
1221     bool LI = isa<LoadInst>(V);
1222     bool SI = isa<StoreInst>(V);
1223     if (!LI && !SI)
1224       return false;
1225     auto *Ty = getMemInstValueType(V);
1226     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1227   }
1228 
1229   /// Returns true if \p I is an instruction that will be scalarized with
1230   /// predication. Such instructions include conditional stores and
1231   /// instructions that may divide by zero.
1232   /// If a non-zero VF has been calculated, we check if I will be scalarized
1233   /// predication for that VF.
1234   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1235 
1236   // Returns true if \p I is an instruction that will be predicated either
1237   // through scalar predication or masked load/store or masked gather/scatter.
1238   // Superset of instructions that return true for isScalarWithPredication.
1239   bool isPredicatedInst(Instruction *I) {
1240     if (!blockNeedsPredication(I->getParent()))
1241       return false;
1242     // Loads and stores that need some form of masked operation are predicated
1243     // instructions.
1244     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1245       return Legal->isMaskRequired(I);
1246     return isScalarWithPredication(I);
1247   }
1248 
1249   /// Returns true if \p I is a memory instruction with consecutive memory
1250   /// access that can be widened.
1251   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1252 
1253   /// Returns true if \p I is a memory instruction in an interleaved-group
1254   /// of memory accesses that can be vectorized with wide vector loads/stores
1255   /// and shuffles.
1256   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1257 
1258   /// Check if \p Instr belongs to any interleaved access group.
1259   bool isAccessInterleaved(Instruction *Instr) {
1260     return InterleaveInfo.isInterleaved(Instr);
1261   }
1262 
1263   /// Get the interleaved access group that \p Instr belongs to.
1264   const InterleaveGroup<Instruction> *
1265   getInterleavedAccessGroup(Instruction *Instr) {
1266     return InterleaveInfo.getInterleaveGroup(Instr);
1267   }
1268 
1269   /// Returns true if an interleaved group requires a scalar iteration
1270   /// to handle accesses with gaps, and there is nothing preventing us from
1271   /// creating a scalar epilogue.
1272   bool requiresScalarEpilogue() const {
1273     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1274   }
1275 
1276   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1277   /// loop hint annotation.
1278   bool isScalarEpilogueAllowed() const {
1279     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1280   }
1281 
1282   /// Returns true if all loop blocks should be masked to fold tail loop.
1283   bool foldTailByMasking() const { return FoldTailByMasking; }
1284 
1285   bool blockNeedsPredication(BasicBlock *BB) {
1286     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1287   }
1288 
1289   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1290   /// with factor VF.  Return the cost of the instruction, including
1291   /// scalarization overhead if it's needed.
1292   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1293 
1294   /// Estimate cost of a call instruction CI if it were vectorized with factor
1295   /// VF. Return the cost of the instruction, including scalarization overhead
1296   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1297   /// scalarized -
1298   /// i.e. either vector version isn't available, or is too expensive.
1299   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1300 
1301 private:
1302   unsigned NumPredStores = 0;
1303 
1304   /// \return An upper bound for the vectorization factor, larger than zero.
1305   /// One is returned if vectorization should best be avoided due to cost.
1306   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1307 
1308   /// The vectorization cost is a combination of the cost itself and a boolean
1309   /// indicating whether any of the contributing operations will actually
1310   /// operate on
1311   /// vector values after type legalization in the backend. If this latter value
1312   /// is
1313   /// false, then all operations will be scalarized (i.e. no vectorization has
1314   /// actually taken place).
1315   using VectorizationCostTy = std::pair<unsigned, bool>;
1316 
1317   /// Returns the expected execution cost. The unit of the cost does
1318   /// not matter because we use the 'cost' units to compare different
1319   /// vector widths. The cost that is returned is *not* normalized by
1320   /// the factor width.
1321   VectorizationCostTy expectedCost(unsigned VF);
1322 
1323   /// Returns the execution time cost of an instruction for a given vector
1324   /// width. Vector width of one means scalar.
1325   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1326 
1327   /// The cost-computation logic from getInstructionCost which provides
1328   /// the vector type as an output parameter.
1329   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1330 
1331   /// Calculate vectorization cost of memory instruction \p I.
1332   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1333 
1334   /// The cost computation for scalarized memory instruction.
1335   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1336 
1337   /// The cost computation for interleaving group of memory instructions.
1338   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1339 
1340   /// The cost computation for Gather/Scatter instruction.
1341   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1342 
1343   /// The cost computation for widening instruction \p I with consecutive
1344   /// memory access.
1345   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1346 
1347   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1348   /// Load: scalar load + broadcast.
1349   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1350   /// element)
1351   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1352 
1353   /// Estimate the overhead of scalarizing an instruction. This is a
1354   /// convenience wrapper for the type-based getScalarizationOverhead API.
1355   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1356 
1357   /// Returns whether the instruction is a load or store and will be a emitted
1358   /// as a vector operation.
1359   bool isConsecutiveLoadOrStore(Instruction *I);
1360 
1361   /// Returns true if an artificially high cost for emulated masked memrefs
1362   /// should be used.
1363   bool useEmulatedMaskMemRefHack(Instruction *I);
1364 
1365   /// Map of scalar integer values to the smallest bitwidth they can be legally
1366   /// represented as. The vector equivalents of these values should be truncated
1367   /// to this type.
1368   MapVector<Instruction *, uint64_t> MinBWs;
1369 
1370   /// A type representing the costs for instructions if they were to be
1371   /// scalarized rather than vectorized. The entries are Instruction-Cost
1372   /// pairs.
1373   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1374 
1375   /// A set containing all BasicBlocks that are known to present after
1376   /// vectorization as a predicated block.
1377   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1378 
1379   /// Records whether it is allowed to have the original scalar loop execute at
1380   /// least once. This may be needed as a fallback loop in case runtime
1381   /// aliasing/dependence checks fail, or to handle the tail/remainder
1382   /// iterations when the trip count is unknown or doesn't divide by the VF,
1383   /// or as a peel-loop to handle gaps in interleave-groups.
1384   /// Under optsize and when the trip count is very small we don't allow any
1385   /// iterations to execute in the scalar loop.
1386   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1387 
1388   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1389   bool FoldTailByMasking = false;
1390 
1391   /// A map holding scalar costs for different vectorization factors. The
1392   /// presence of a cost for an instruction in the mapping indicates that the
1393   /// instruction will be scalarized when vectorizing with the associated
1394   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1395   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1396 
1397   /// Holds the instructions known to be uniform after vectorization.
1398   /// The data is collected per VF.
1399   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1400 
1401   /// Holds the instructions known to be scalar after vectorization.
1402   /// The data is collected per VF.
1403   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1404 
1405   /// Holds the instructions (address computations) that are forced to be
1406   /// scalarized.
1407   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1408 
1409   /// Returns the expected difference in cost from scalarizing the expression
1410   /// feeding a predicated instruction \p PredInst. The instructions to
1411   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1412   /// non-negative return value implies the expression will be scalarized.
1413   /// Currently, only single-use chains are considered for scalarization.
1414   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1415                               unsigned VF);
1416 
1417   /// Collect the instructions that are uniform after vectorization. An
1418   /// instruction is uniform if we represent it with a single scalar value in
1419   /// the vectorized loop corresponding to each vector iteration. Examples of
1420   /// uniform instructions include pointer operands of consecutive or
1421   /// interleaved memory accesses. Note that although uniformity implies an
1422   /// instruction will be scalar, the reverse is not true. In general, a
1423   /// scalarized instruction will be represented by VF scalar values in the
1424   /// vectorized loop, each corresponding to an iteration of the original
1425   /// scalar loop.
1426   void collectLoopUniforms(unsigned VF);
1427 
1428   /// Collect the instructions that are scalar after vectorization. An
1429   /// instruction is scalar if it is known to be uniform or will be scalarized
1430   /// during vectorization. Non-uniform scalarized instructions will be
1431   /// represented by VF values in the vectorized loop, each corresponding to an
1432   /// iteration of the original scalar loop.
1433   void collectLoopScalars(unsigned VF);
1434 
1435   /// Keeps cost model vectorization decision and cost for instructions.
1436   /// Right now it is used for memory instructions only.
1437   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1438                                 std::pair<InstWidening, unsigned>>;
1439 
1440   DecisionList WideningDecisions;
1441 
1442   /// Returns true if \p V is expected to be vectorized and it needs to be
1443   /// extracted.
1444   bool needsExtract(Value *V, unsigned VF) const {
1445     Instruction *I = dyn_cast<Instruction>(V);
1446     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1447       return false;
1448 
1449     // Assume we can vectorize V (and hence we need extraction) if the
1450     // scalars are not computed yet. This can happen, because it is called
1451     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1452     // the scalars are collected. That should be a safe assumption in most
1453     // cases, because we check if the operands have vectorizable types
1454     // beforehand in LoopVectorizationLegality.
1455     return Scalars.find(VF) == Scalars.end() ||
1456            !isScalarAfterVectorization(I, VF);
1457   };
1458 
1459   /// Returns a range containing only operands needing to be extracted.
1460   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1461                                                    unsigned VF) {
1462     return SmallVector<Value *, 4>(make_filter_range(
1463         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1464   }
1465 
1466 public:
1467   /// The loop that we evaluate.
1468   Loop *TheLoop;
1469 
1470   /// Predicated scalar evolution analysis.
1471   PredicatedScalarEvolution &PSE;
1472 
1473   /// Loop Info analysis.
1474   LoopInfo *LI;
1475 
1476   /// Vectorization legality.
1477   LoopVectorizationLegality *Legal;
1478 
1479   /// Vector target information.
1480   const TargetTransformInfo &TTI;
1481 
1482   /// Target Library Info.
1483   const TargetLibraryInfo *TLI;
1484 
1485   /// Demanded bits analysis.
1486   DemandedBits *DB;
1487 
1488   /// Assumption cache.
1489   AssumptionCache *AC;
1490 
1491   /// Interface to emit optimization remarks.
1492   OptimizationRemarkEmitter *ORE;
1493 
1494   const Function *TheFunction;
1495 
1496   /// Loop Vectorize Hint.
1497   const LoopVectorizeHints *Hints;
1498 
1499   /// The interleave access information contains groups of interleaved accesses
1500   /// with the same stride and close to each other.
1501   InterleavedAccessInfo &InterleaveInfo;
1502 
1503   /// Values to ignore in the cost model.
1504   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1505 
1506   /// Values to ignore in the cost model when VF > 1.
1507   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1508 };
1509 
1510 } // end namespace llvm
1511 
1512 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1513 // vectorization. The loop needs to be annotated with #pragma omp simd
1514 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1515 // vector length information is not provided, vectorization is not considered
1516 // explicit. Interleave hints are not allowed either. These limitations will be
1517 // relaxed in the future.
1518 // Please, note that we are currently forced to abuse the pragma 'clang
1519 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1520 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1521 // provides *explicit vectorization hints* (LV can bypass legal checks and
1522 // assume that vectorization is legal). However, both hints are implemented
1523 // using the same metadata (llvm.loop.vectorize, processed by
1524 // LoopVectorizeHints). This will be fixed in the future when the native IR
1525 // representation for pragma 'omp simd' is introduced.
1526 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1527                                    OptimizationRemarkEmitter *ORE) {
1528   assert(!OuterLp->empty() && "This is not an outer loop");
1529   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1530 
1531   // Only outer loops with an explicit vectorization hint are supported.
1532   // Unannotated outer loops are ignored.
1533   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1534     return false;
1535 
1536   Function *Fn = OuterLp->getHeader()->getParent();
1537   if (!Hints.allowVectorization(Fn, OuterLp,
1538                                 true /*VectorizeOnlyWhenForced*/)) {
1539     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1540     return false;
1541   }
1542 
1543   if (Hints.getInterleave() > 1) {
1544     // TODO: Interleave support is future work.
1545     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1546                          "outer loops.\n");
1547     Hints.emitRemarkWithHints();
1548     return false;
1549   }
1550 
1551   return true;
1552 }
1553 
1554 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1555                                   OptimizationRemarkEmitter *ORE,
1556                                   SmallVectorImpl<Loop *> &V) {
1557   // Collect inner loops and outer loops without irreducible control flow. For
1558   // now, only collect outer loops that have explicit vectorization hints. If we
1559   // are stress testing the VPlan H-CFG construction, we collect the outermost
1560   // loop of every loop nest.
1561   if (L.empty() || VPlanBuildStressTest ||
1562       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1563     LoopBlocksRPO RPOT(&L);
1564     RPOT.perform(LI);
1565     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1566       V.push_back(&L);
1567       // TODO: Collect inner loops inside marked outer loops in case
1568       // vectorization fails for the outer loop. Do not invoke
1569       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1570       // already known to be reducible. We can use an inherited attribute for
1571       // that.
1572       return;
1573     }
1574   }
1575   for (Loop *InnerL : L)
1576     collectSupportedLoops(*InnerL, LI, ORE, V);
1577 }
1578 
1579 namespace {
1580 
1581 /// The LoopVectorize Pass.
1582 struct LoopVectorize : public FunctionPass {
1583   /// Pass identification, replacement for typeid
1584   static char ID;
1585 
1586   LoopVectorizePass Impl;
1587 
1588   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1589                          bool VectorizeOnlyWhenForced = false)
1590       : FunctionPass(ID) {
1591     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1592     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1593     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1594   }
1595 
1596   bool runOnFunction(Function &F) override {
1597     if (skipFunction(F))
1598       return false;
1599 
1600     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1601     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1602     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1603     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1604     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1605     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1606     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1607     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1608     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1609     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1610     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1611     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1612     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1613 
1614     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1615         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1616 
1617     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1618                         GetLAA, *ORE, PSI);
1619   }
1620 
1621   void getAnalysisUsage(AnalysisUsage &AU) const override {
1622     AU.addRequired<AssumptionCacheTracker>();
1623     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1624     AU.addRequired<DominatorTreeWrapperPass>();
1625     AU.addRequired<LoopInfoWrapperPass>();
1626     AU.addRequired<ScalarEvolutionWrapperPass>();
1627     AU.addRequired<TargetTransformInfoWrapperPass>();
1628     AU.addRequired<AAResultsWrapperPass>();
1629     AU.addRequired<LoopAccessLegacyAnalysis>();
1630     AU.addRequired<DemandedBitsWrapperPass>();
1631     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1632 
1633     // We currently do not preserve loopinfo/dominator analyses with outer loop
1634     // vectorization. Until this is addressed, mark these analyses as preserved
1635     // only for non-VPlan-native path.
1636     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1637     if (!EnableVPlanNativePath) {
1638       AU.addPreserved<LoopInfoWrapperPass>();
1639       AU.addPreserved<DominatorTreeWrapperPass>();
1640     }
1641 
1642     AU.addPreserved<BasicAAWrapperPass>();
1643     AU.addPreserved<GlobalsAAWrapperPass>();
1644     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1645   }
1646 };
1647 
1648 } // end anonymous namespace
1649 
1650 //===----------------------------------------------------------------------===//
1651 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1652 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1653 //===----------------------------------------------------------------------===//
1654 
1655 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1656   // We need to place the broadcast of invariant variables outside the loop,
1657   // but only if it's proven safe to do so. Else, broadcast will be inside
1658   // vector loop body.
1659   Instruction *Instr = dyn_cast<Instruction>(V);
1660   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1661                      (!Instr ||
1662                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1663   // Place the code for broadcasting invariant variables in the new preheader.
1664   IRBuilder<>::InsertPointGuard Guard(Builder);
1665   if (SafeToHoist)
1666     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1667 
1668   // Broadcast the scalar into all locations in the vector.
1669   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1670 
1671   return Shuf;
1672 }
1673 
1674 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1675     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1676   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1677          "Expected either an induction phi-node or a truncate of it!");
1678   Value *Start = II.getStartValue();
1679 
1680   // Construct the initial value of the vector IV in the vector loop preheader
1681   auto CurrIP = Builder.saveIP();
1682   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1683   if (isa<TruncInst>(EntryVal)) {
1684     assert(Start->getType()->isIntegerTy() &&
1685            "Truncation requires an integer type");
1686     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1687     Step = Builder.CreateTrunc(Step, TruncType);
1688     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1689   }
1690   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1691   Value *SteppedStart =
1692       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1693 
1694   // We create vector phi nodes for both integer and floating-point induction
1695   // variables. Here, we determine the kind of arithmetic we will perform.
1696   Instruction::BinaryOps AddOp;
1697   Instruction::BinaryOps MulOp;
1698   if (Step->getType()->isIntegerTy()) {
1699     AddOp = Instruction::Add;
1700     MulOp = Instruction::Mul;
1701   } else {
1702     AddOp = II.getInductionOpcode();
1703     MulOp = Instruction::FMul;
1704   }
1705 
1706   // Multiply the vectorization factor by the step using integer or
1707   // floating-point arithmetic as appropriate.
1708   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1709   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1710 
1711   // Create a vector splat to use in the induction update.
1712   //
1713   // FIXME: If the step is non-constant, we create the vector splat with
1714   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1715   //        handle a constant vector splat.
1716   Value *SplatVF = isa<Constant>(Mul)
1717                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1718                        : Builder.CreateVectorSplat(VF, Mul);
1719   Builder.restoreIP(CurrIP);
1720 
1721   // We may need to add the step a number of times, depending on the unroll
1722   // factor. The last of those goes into the PHI.
1723   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1724                                     &*LoopVectorBody->getFirstInsertionPt());
1725   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1726   Instruction *LastInduction = VecInd;
1727   for (unsigned Part = 0; Part < UF; ++Part) {
1728     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1729 
1730     if (isa<TruncInst>(EntryVal))
1731       addMetadata(LastInduction, EntryVal);
1732     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1733 
1734     LastInduction = cast<Instruction>(addFastMathFlag(
1735         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1736     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1737   }
1738 
1739   // Move the last step to the end of the latch block. This ensures consistent
1740   // placement of all induction updates.
1741   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1742   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1743   auto *ICmp = cast<Instruction>(Br->getCondition());
1744   LastInduction->moveBefore(ICmp);
1745   LastInduction->setName("vec.ind.next");
1746 
1747   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1748   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1749 }
1750 
1751 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1752   return Cost->isScalarAfterVectorization(I, VF) ||
1753          Cost->isProfitableToScalarize(I, VF);
1754 }
1755 
1756 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1757   if (shouldScalarizeInstruction(IV))
1758     return true;
1759   auto isScalarInst = [&](User *U) -> bool {
1760     auto *I = cast<Instruction>(U);
1761     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1762   };
1763   return llvm::any_of(IV->users(), isScalarInst);
1764 }
1765 
1766 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1767     const InductionDescriptor &ID, const Instruction *EntryVal,
1768     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1769   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1770          "Expected either an induction phi-node or a truncate of it!");
1771 
1772   // This induction variable is not the phi from the original loop but the
1773   // newly-created IV based on the proof that casted Phi is equal to the
1774   // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1775   // re-uses the same InductionDescriptor that original IV uses but we don't
1776   // have to do any recording in this case - that is done when original IV is
1777   // processed.
1778   if (isa<TruncInst>(EntryVal))
1779     return;
1780 
1781   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1782   if (Casts.empty())
1783     return;
1784   // Only the first Cast instruction in the Casts vector is of interest.
1785   // The rest of the Casts (if exist) have no uses outside the
1786   // induction update chain itself.
1787   Instruction *CastInst = *Casts.begin();
1788   if (Lane < UINT_MAX)
1789     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1790   else
1791     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1792 }
1793 
1794 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1795   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1796          "Primary induction variable must have an integer type");
1797 
1798   auto II = Legal->getInductionVars()->find(IV);
1799   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1800 
1801   auto ID = II->second;
1802   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1803 
1804   // The scalar value to broadcast. This will be derived from the canonical
1805   // induction variable.
1806   Value *ScalarIV = nullptr;
1807 
1808   // The value from the original loop to which we are mapping the new induction
1809   // variable.
1810   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1811 
1812   // True if we have vectorized the induction variable.
1813   auto VectorizedIV = false;
1814 
1815   // Determine if we want a scalar version of the induction variable. This is
1816   // true if the induction variable itself is not widened, or if it has at
1817   // least one user in the loop that is not widened.
1818   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1819 
1820   // Generate code for the induction step. Note that induction steps are
1821   // required to be loop-invariant
1822   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1823          "Induction step should be loop invariant");
1824   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1825   Value *Step = nullptr;
1826   if (PSE.getSE()->isSCEVable(IV->getType())) {
1827     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1828     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1829                              LoopVectorPreHeader->getTerminator());
1830   } else {
1831     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1832   }
1833 
1834   // Try to create a new independent vector induction variable. If we can't
1835   // create the phi node, we will splat the scalar induction variable in each
1836   // loop iteration.
1837   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1838     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1839     VectorizedIV = true;
1840   }
1841 
1842   // If we haven't yet vectorized the induction variable, or if we will create
1843   // a scalar one, we need to define the scalar induction variable and step
1844   // values. If we were given a truncation type, truncate the canonical
1845   // induction variable and step. Otherwise, derive these values from the
1846   // induction descriptor.
1847   if (!VectorizedIV || NeedsScalarIV) {
1848     ScalarIV = Induction;
1849     if (IV != OldInduction) {
1850       ScalarIV = IV->getType()->isIntegerTy()
1851                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1852                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1853                                           IV->getType());
1854       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1855       ScalarIV->setName("offset.idx");
1856     }
1857     if (Trunc) {
1858       auto *TruncType = cast<IntegerType>(Trunc->getType());
1859       assert(Step->getType()->isIntegerTy() &&
1860              "Truncation requires an integer step");
1861       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1862       Step = Builder.CreateTrunc(Step, TruncType);
1863     }
1864   }
1865 
1866   // If we haven't yet vectorized the induction variable, splat the scalar
1867   // induction variable, and build the necessary step vectors.
1868   // TODO: Don't do it unless the vectorized IV is really required.
1869   if (!VectorizedIV) {
1870     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1871     for (unsigned Part = 0; Part < UF; ++Part) {
1872       Value *EntryPart =
1873           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1874       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1875       if (Trunc)
1876         addMetadata(EntryPart, Trunc);
1877       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1878     }
1879   }
1880 
1881   // If an induction variable is only used for counting loop iterations or
1882   // calculating addresses, it doesn't need to be widened. Create scalar steps
1883   // that can be used by instructions we will later scalarize. Note that the
1884   // addition of the scalar steps will not increase the number of instructions
1885   // in the loop in the common case prior to InstCombine. We will be trading
1886   // one vector extract for each scalar step.
1887   if (NeedsScalarIV)
1888     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1889 }
1890 
1891 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1892                                           Instruction::BinaryOps BinOp) {
1893   // Create and check the types.
1894   assert(Val->getType()->isVectorTy() && "Must be a vector");
1895   int VLen = Val->getType()->getVectorNumElements();
1896 
1897   Type *STy = Val->getType()->getScalarType();
1898   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1899          "Induction Step must be an integer or FP");
1900   assert(Step->getType() == STy && "Step has wrong type");
1901 
1902   SmallVector<Constant *, 8> Indices;
1903 
1904   if (STy->isIntegerTy()) {
1905     // Create a vector of consecutive numbers from zero to VF.
1906     for (int i = 0; i < VLen; ++i)
1907       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1908 
1909     // Add the consecutive indices to the vector value.
1910     Constant *Cv = ConstantVector::get(Indices);
1911     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1912     Step = Builder.CreateVectorSplat(VLen, Step);
1913     assert(Step->getType() == Val->getType() && "Invalid step vec");
1914     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1915     // which can be found from the original scalar operations.
1916     Step = Builder.CreateMul(Cv, Step);
1917     return Builder.CreateAdd(Val, Step, "induction");
1918   }
1919 
1920   // Floating point induction.
1921   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1922          "Binary Opcode should be specified for FP induction");
1923   // Create a vector of consecutive numbers from zero to VF.
1924   for (int i = 0; i < VLen; ++i)
1925     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1926 
1927   // Add the consecutive indices to the vector value.
1928   Constant *Cv = ConstantVector::get(Indices);
1929 
1930   Step = Builder.CreateVectorSplat(VLen, Step);
1931 
1932   // Floating point operations had to be 'fast' to enable the induction.
1933   FastMathFlags Flags;
1934   Flags.setFast();
1935 
1936   Value *MulOp = Builder.CreateFMul(Cv, Step);
1937   if (isa<Instruction>(MulOp))
1938     // Have to check, MulOp may be a constant
1939     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1940 
1941   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1942   if (isa<Instruction>(BOp))
1943     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1944   return BOp;
1945 }
1946 
1947 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1948                                            Instruction *EntryVal,
1949                                            const InductionDescriptor &ID) {
1950   // We shouldn't have to build scalar steps if we aren't vectorizing.
1951   assert(VF > 1 && "VF should be greater than one");
1952 
1953   // Get the value type and ensure it and the step have the same integer type.
1954   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1955   assert(ScalarIVTy == Step->getType() &&
1956          "Val and Step should have the same type");
1957 
1958   // We build scalar steps for both integer and floating-point induction
1959   // variables. Here, we determine the kind of arithmetic we will perform.
1960   Instruction::BinaryOps AddOp;
1961   Instruction::BinaryOps MulOp;
1962   if (ScalarIVTy->isIntegerTy()) {
1963     AddOp = Instruction::Add;
1964     MulOp = Instruction::Mul;
1965   } else {
1966     AddOp = ID.getInductionOpcode();
1967     MulOp = Instruction::FMul;
1968   }
1969 
1970   // Determine the number of scalars we need to generate for each unroll
1971   // iteration. If EntryVal is uniform, we only need to generate the first
1972   // lane. Otherwise, we generate all VF values.
1973   unsigned Lanes =
1974       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1975                                                                          : VF;
1976   // Compute the scalar steps and save the results in VectorLoopValueMap.
1977   for (unsigned Part = 0; Part < UF; ++Part) {
1978     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1979       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1980       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1981       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1982       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1983       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1984     }
1985   }
1986 }
1987 
1988 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1989   assert(V != Induction && "The new induction variable should not be used.");
1990   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1991   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1992 
1993   // If we have a stride that is replaced by one, do it here. Defer this for
1994   // the VPlan-native path until we start running Legal checks in that path.
1995   if (!EnableVPlanNativePath && Legal->hasStride(V))
1996     V = ConstantInt::get(V->getType(), 1);
1997 
1998   // If we have a vector mapped to this value, return it.
1999   if (VectorLoopValueMap.hasVectorValue(V, Part))
2000     return VectorLoopValueMap.getVectorValue(V, Part);
2001 
2002   // If the value has not been vectorized, check if it has been scalarized
2003   // instead. If it has been scalarized, and we actually need the value in
2004   // vector form, we will construct the vector values on demand.
2005   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2006     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2007 
2008     // If we've scalarized a value, that value should be an instruction.
2009     auto *I = cast<Instruction>(V);
2010 
2011     // If we aren't vectorizing, we can just copy the scalar map values over to
2012     // the vector map.
2013     if (VF == 1) {
2014       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2015       return ScalarValue;
2016     }
2017 
2018     // Get the last scalar instruction we generated for V and Part. If the value
2019     // is known to be uniform after vectorization, this corresponds to lane zero
2020     // of the Part unroll iteration. Otherwise, the last instruction is the one
2021     // we created for the last vector lane of the Part unroll iteration.
2022     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2023     auto *LastInst = cast<Instruction>(
2024         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2025 
2026     // Set the insert point after the last scalarized instruction. This ensures
2027     // the insertelement sequence will directly follow the scalar definitions.
2028     auto OldIP = Builder.saveIP();
2029     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2030     Builder.SetInsertPoint(&*NewIP);
2031 
2032     // However, if we are vectorizing, we need to construct the vector values.
2033     // If the value is known to be uniform after vectorization, we can just
2034     // broadcast the scalar value corresponding to lane zero for each unroll
2035     // iteration. Otherwise, we construct the vector values using insertelement
2036     // instructions. Since the resulting vectors are stored in
2037     // VectorLoopValueMap, we will only generate the insertelements once.
2038     Value *VectorValue = nullptr;
2039     if (Cost->isUniformAfterVectorization(I, VF)) {
2040       VectorValue = getBroadcastInstrs(ScalarValue);
2041       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2042     } else {
2043       // Initialize packing with insertelements to start from undef.
2044       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2045       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2046       for (unsigned Lane = 0; Lane < VF; ++Lane)
2047         packScalarIntoVectorValue(V, {Part, Lane});
2048       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2049     }
2050     Builder.restoreIP(OldIP);
2051     return VectorValue;
2052   }
2053 
2054   // If this scalar is unknown, assume that it is a constant or that it is
2055   // loop invariant. Broadcast V and save the value for future uses.
2056   Value *B = getBroadcastInstrs(V);
2057   VectorLoopValueMap.setVectorValue(V, Part, B);
2058   return B;
2059 }
2060 
2061 Value *
2062 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2063                                             const VPIteration &Instance) {
2064   // If the value is not an instruction contained in the loop, it should
2065   // already be scalar.
2066   if (OrigLoop->isLoopInvariant(V))
2067     return V;
2068 
2069   assert(Instance.Lane > 0
2070              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2071              : true && "Uniform values only have lane zero");
2072 
2073   // If the value from the original loop has not been vectorized, it is
2074   // represented by UF x VF scalar values in the new loop. Return the requested
2075   // scalar value.
2076   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2077     return VectorLoopValueMap.getScalarValue(V, Instance);
2078 
2079   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2080   // for the given unroll part. If this entry is not a vector type (i.e., the
2081   // vectorization factor is one), there is no need to generate an
2082   // extractelement instruction.
2083   auto *U = getOrCreateVectorValue(V, Instance.Part);
2084   if (!U->getType()->isVectorTy()) {
2085     assert(VF == 1 && "Value not scalarized has non-vector type");
2086     return U;
2087   }
2088 
2089   // Otherwise, the value from the original loop has been vectorized and is
2090   // represented by UF vector values. Extract and return the requested scalar
2091   // value from the appropriate vector lane.
2092   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2093 }
2094 
2095 void InnerLoopVectorizer::packScalarIntoVectorValue(
2096     Value *V, const VPIteration &Instance) {
2097   assert(V != Induction && "The new induction variable should not be used.");
2098   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2099   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2100 
2101   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2102   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2103   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2104                                             Builder.getInt32(Instance.Lane));
2105   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2106 }
2107 
2108 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2109   assert(Vec->getType()->isVectorTy() && "Invalid type");
2110   SmallVector<Constant *, 8> ShuffleMask;
2111   for (unsigned i = 0; i < VF; ++i)
2112     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2113 
2114   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2115                                      ConstantVector::get(ShuffleMask),
2116                                      "reverse");
2117 }
2118 
2119 // Return whether we allow using masked interleave-groups (for dealing with
2120 // strided loads/stores that reside in predicated blocks, or for dealing
2121 // with gaps).
2122 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2123   // If an override option has been passed in for interleaved accesses, use it.
2124   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2125     return EnableMaskedInterleavedMemAccesses;
2126 
2127   return TTI.enableMaskedInterleavedAccessVectorization();
2128 }
2129 
2130 // Try to vectorize the interleave group that \p Instr belongs to.
2131 //
2132 // E.g. Translate following interleaved load group (factor = 3):
2133 //   for (i = 0; i < N; i+=3) {
2134 //     R = Pic[i];             // Member of index 0
2135 //     G = Pic[i+1];           // Member of index 1
2136 //     B = Pic[i+2];           // Member of index 2
2137 //     ... // do something to R, G, B
2138 //   }
2139 // To:
2140 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2141 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2142 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2143 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2144 //
2145 // Or translate following interleaved store group (factor = 3):
2146 //   for (i = 0; i < N; i+=3) {
2147 //     ... do something to R, G, B
2148 //     Pic[i]   = R;           // Member of index 0
2149 //     Pic[i+1] = G;           // Member of index 1
2150 //     Pic[i+2] = B;           // Member of index 2
2151 //   }
2152 // To:
2153 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2154 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2155 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2156 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2157 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2158 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2159                                                    VectorParts *BlockInMask) {
2160   const InterleaveGroup<Instruction> *Group =
2161       Cost->getInterleavedAccessGroup(Instr);
2162   assert(Group && "Fail to get an interleaved access group.");
2163 
2164   // Skip if current instruction is not the insert position.
2165   if (Instr != Group->getInsertPos())
2166     return;
2167 
2168   const DataLayout &DL = Instr->getModule()->getDataLayout();
2169   Value *Ptr = getLoadStorePointerOperand(Instr);
2170 
2171   // Prepare for the vector type of the interleaved load/store.
2172   Type *ScalarTy = getMemInstValueType(Instr);
2173   unsigned InterleaveFactor = Group->getFactor();
2174   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2175   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2176 
2177   // Prepare for the new pointers.
2178   setDebugLocFromInst(Builder, Ptr);
2179   SmallVector<Value *, 2> NewPtrs;
2180   unsigned Index = Group->getIndex(Instr);
2181 
2182   VectorParts Mask;
2183   bool IsMaskForCondRequired = BlockInMask;
2184   if (IsMaskForCondRequired) {
2185     Mask = *BlockInMask;
2186     // TODO: extend the masked interleaved-group support to reversed access.
2187     assert(!Group->isReverse() && "Reversed masked interleave-group "
2188                                   "not supported.");
2189   }
2190 
2191   // If the group is reverse, adjust the index to refer to the last vector lane
2192   // instead of the first. We adjust the index from the first vector lane,
2193   // rather than directly getting the pointer for lane VF - 1, because the
2194   // pointer operand of the interleaved access is supposed to be uniform. For
2195   // uniform instructions, we're only required to generate a value for the
2196   // first vector lane in each unroll iteration.
2197   if (Group->isReverse())
2198     Index += (VF - 1) * Group->getFactor();
2199 
2200   bool InBounds = false;
2201   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2202     InBounds = gep->isInBounds();
2203 
2204   for (unsigned Part = 0; Part < UF; Part++) {
2205     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2206 
2207     // Notice current instruction could be any index. Need to adjust the address
2208     // to the member of index 0.
2209     //
2210     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2211     //       b = A[i];       // Member of index 0
2212     // Current pointer is pointed to A[i+1], adjust it to A[i].
2213     //
2214     // E.g.  A[i+1] = a;     // Member of index 1
2215     //       A[i]   = b;     // Member of index 0
2216     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2217     // Current pointer is pointed to A[i+2], adjust it to A[i].
2218     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2219     if (InBounds)
2220       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2221 
2222     // Cast to the vector pointer type.
2223     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2224   }
2225 
2226   setDebugLocFromInst(Builder, Instr);
2227   Value *UndefVec = UndefValue::get(VecTy);
2228 
2229   Value *MaskForGaps = nullptr;
2230   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2231     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2232     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2233   }
2234 
2235   // Vectorize the interleaved load group.
2236   if (isa<LoadInst>(Instr)) {
2237     // For each unroll part, create a wide load for the group.
2238     SmallVector<Value *, 2> NewLoads;
2239     for (unsigned Part = 0; Part < UF; Part++) {
2240       Instruction *NewLoad;
2241       if (IsMaskForCondRequired || MaskForGaps) {
2242         assert(useMaskedInterleavedAccesses(*TTI) &&
2243                "masked interleaved groups are not allowed.");
2244         Value *GroupMask = MaskForGaps;
2245         if (IsMaskForCondRequired) {
2246           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2247           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2248           Value *ShuffledMask = Builder.CreateShuffleVector(
2249               Mask[Part], Undefs, RepMask, "interleaved.mask");
2250           GroupMask = MaskForGaps
2251                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2252                                                 MaskForGaps)
2253                           : ShuffledMask;
2254         }
2255         NewLoad =
2256             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2257                                      GroupMask, UndefVec, "wide.masked.vec");
2258       }
2259       else
2260         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2261                                             Group->getAlignment(), "wide.vec");
2262       Group->addMetadata(NewLoad);
2263       NewLoads.push_back(NewLoad);
2264     }
2265 
2266     // For each member in the group, shuffle out the appropriate data from the
2267     // wide loads.
2268     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2269       Instruction *Member = Group->getMember(I);
2270 
2271       // Skip the gaps in the group.
2272       if (!Member)
2273         continue;
2274 
2275       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2276       for (unsigned Part = 0; Part < UF; Part++) {
2277         Value *StridedVec = Builder.CreateShuffleVector(
2278             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2279 
2280         // If this member has different type, cast the result type.
2281         if (Member->getType() != ScalarTy) {
2282           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2283           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2284         }
2285 
2286         if (Group->isReverse())
2287           StridedVec = reverseVector(StridedVec);
2288 
2289         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2290       }
2291     }
2292     return;
2293   }
2294 
2295   // The sub vector type for current instruction.
2296   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2297 
2298   // Vectorize the interleaved store group.
2299   for (unsigned Part = 0; Part < UF; Part++) {
2300     // Collect the stored vector from each member.
2301     SmallVector<Value *, 4> StoredVecs;
2302     for (unsigned i = 0; i < InterleaveFactor; i++) {
2303       // Interleaved store group doesn't allow a gap, so each index has a member
2304       Instruction *Member = Group->getMember(i);
2305       assert(Member && "Fail to get a member from an interleaved store group");
2306 
2307       Value *StoredVec = getOrCreateVectorValue(
2308           cast<StoreInst>(Member)->getValueOperand(), Part);
2309       if (Group->isReverse())
2310         StoredVec = reverseVector(StoredVec);
2311 
2312       // If this member has different type, cast it to a unified type.
2313 
2314       if (StoredVec->getType() != SubVT)
2315         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2316 
2317       StoredVecs.push_back(StoredVec);
2318     }
2319 
2320     // Concatenate all vectors into a wide vector.
2321     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2322 
2323     // Interleave the elements in the wide vector.
2324     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2325     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2326                                               "interleaved.vec");
2327 
2328     Instruction *NewStoreInstr;
2329     if (IsMaskForCondRequired) {
2330       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2331       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2332       Value *ShuffledMask = Builder.CreateShuffleVector(
2333           Mask[Part], Undefs, RepMask, "interleaved.mask");
2334       NewStoreInstr = Builder.CreateMaskedStore(
2335           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2336     }
2337     else
2338       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2339         Group->getAlignment());
2340 
2341     Group->addMetadata(NewStoreInstr);
2342   }
2343 }
2344 
2345 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2346                                                      VectorParts *BlockInMask) {
2347   // Attempt to issue a wide load.
2348   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2349   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2350 
2351   assert((LI || SI) && "Invalid Load/Store instruction");
2352 
2353   LoopVectorizationCostModel::InstWidening Decision =
2354       Cost->getWideningDecision(Instr, VF);
2355   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2356          "CM decision should be taken at this point");
2357   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2358     return vectorizeInterleaveGroup(Instr);
2359 
2360   Type *ScalarDataTy = getMemInstValueType(Instr);
2361   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2362   Value *Ptr = getLoadStorePointerOperand(Instr);
2363   // An alignment of 0 means target abi alignment. We need to use the scalar's
2364   // target abi alignment in such a case.
2365   const DataLayout &DL = Instr->getModule()->getDataLayout();
2366   const Align Alignment =
2367       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2368   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2369 
2370   // Determine if the pointer operand of the access is either consecutive or
2371   // reverse consecutive.
2372   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2373   bool ConsecutiveStride =
2374       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2375   bool CreateGatherScatter =
2376       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2377 
2378   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2379   // gather/scatter. Otherwise Decision should have been to Scalarize.
2380   assert((ConsecutiveStride || CreateGatherScatter) &&
2381          "The instruction should be scalarized");
2382 
2383   // Handle consecutive loads/stores.
2384   if (ConsecutiveStride)
2385     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2386 
2387   VectorParts Mask;
2388   bool isMaskRequired = BlockInMask;
2389   if (isMaskRequired)
2390     Mask = *BlockInMask;
2391 
2392   bool InBounds = false;
2393   if (auto *gep = dyn_cast<GetElementPtrInst>(
2394           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2395     InBounds = gep->isInBounds();
2396 
2397   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2398     // Calculate the pointer for the specific unroll-part.
2399     GetElementPtrInst *PartPtr = nullptr;
2400 
2401     if (Reverse) {
2402       // If the address is consecutive but reversed, then the
2403       // wide store needs to start at the last vector element.
2404       PartPtr = cast<GetElementPtrInst>(
2405           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2406       PartPtr->setIsInBounds(InBounds);
2407       PartPtr = cast<GetElementPtrInst>(
2408           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2409       PartPtr->setIsInBounds(InBounds);
2410       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2411         Mask[Part] = reverseVector(Mask[Part]);
2412     } else {
2413       PartPtr = cast<GetElementPtrInst>(
2414           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2415       PartPtr->setIsInBounds(InBounds);
2416     }
2417 
2418     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2419   };
2420 
2421   // Handle Stores:
2422   if (SI) {
2423     setDebugLocFromInst(Builder, SI);
2424 
2425     for (unsigned Part = 0; Part < UF; ++Part) {
2426       Instruction *NewSI = nullptr;
2427       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2428       if (CreateGatherScatter) {
2429         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2430         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2431         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2432                                             Alignment.value(), MaskPart);
2433       } else {
2434         if (Reverse) {
2435           // If we store to reverse consecutive memory locations, then we need
2436           // to reverse the order of elements in the stored value.
2437           StoredVal = reverseVector(StoredVal);
2438           // We don't want to update the value in the map as it might be used in
2439           // another expression. So don't call resetVectorValue(StoredVal).
2440         }
2441         auto *VecPtr = CreateVecPtr(Part, Ptr);
2442         if (isMaskRequired)
2443           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
2444                                             Alignment.value(), Mask[Part]);
2445         else
2446           NewSI =
2447               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2448       }
2449       addMetadata(NewSI, SI);
2450     }
2451     return;
2452   }
2453 
2454   // Handle loads.
2455   assert(LI && "Must have a load instruction");
2456   setDebugLocFromInst(Builder, LI);
2457   for (unsigned Part = 0; Part < UF; ++Part) {
2458     Value *NewLI;
2459     if (CreateGatherScatter) {
2460       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2461       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2462       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2463                                          nullptr, "wide.masked.gather");
2464       addMetadata(NewLI, LI);
2465     } else {
2466       auto *VecPtr = CreateVecPtr(Part, Ptr);
2467       if (isMaskRequired)
2468         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
2469                                          UndefValue::get(DataTy),
2470                                          "wide.masked.load");
2471       else
2472         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2473                                           "wide.load");
2474 
2475       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2476       addMetadata(NewLI, LI);
2477       if (Reverse)
2478         NewLI = reverseVector(NewLI);
2479     }
2480     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2481   }
2482 }
2483 
2484 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2485                                                const VPIteration &Instance,
2486                                                bool IfPredicateInstr) {
2487   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2488 
2489   setDebugLocFromInst(Builder, Instr);
2490 
2491   // Does this instruction return a value ?
2492   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2493 
2494   Instruction *Cloned = Instr->clone();
2495   if (!IsVoidRetTy)
2496     Cloned->setName(Instr->getName() + ".cloned");
2497 
2498   // Replace the operands of the cloned instructions with their scalar
2499   // equivalents in the new loop.
2500   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2501     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2502     Cloned->setOperand(op, NewOp);
2503   }
2504   addNewMetadata(Cloned, Instr);
2505 
2506   // Place the cloned scalar in the new loop.
2507   Builder.Insert(Cloned);
2508 
2509   // Add the cloned scalar to the scalar map entry.
2510   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2511 
2512   // If we just cloned a new assumption, add it the assumption cache.
2513   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2514     if (II->getIntrinsicID() == Intrinsic::assume)
2515       AC->registerAssumption(II);
2516 
2517   // End if-block.
2518   if (IfPredicateInstr)
2519     PredicatedInstructions.push_back(Cloned);
2520 }
2521 
2522 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2523                                                       Value *End, Value *Step,
2524                                                       Instruction *DL) {
2525   BasicBlock *Header = L->getHeader();
2526   BasicBlock *Latch = L->getLoopLatch();
2527   // As we're just creating this loop, it's possible no latch exists
2528   // yet. If so, use the header as this will be a single block loop.
2529   if (!Latch)
2530     Latch = Header;
2531 
2532   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2533   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2534   setDebugLocFromInst(Builder, OldInst);
2535   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2536 
2537   Builder.SetInsertPoint(Latch->getTerminator());
2538   setDebugLocFromInst(Builder, OldInst);
2539 
2540   // Create i+1 and fill the PHINode.
2541   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2542   Induction->addIncoming(Start, L->getLoopPreheader());
2543   Induction->addIncoming(Next, Latch);
2544   // Create the compare.
2545   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2546   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2547 
2548   // Now we have two terminators. Remove the old one from the block.
2549   Latch->getTerminator()->eraseFromParent();
2550 
2551   return Induction;
2552 }
2553 
2554 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2555   if (TripCount)
2556     return TripCount;
2557 
2558   assert(L && "Create Trip Count for null loop.");
2559   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2560   // Find the loop boundaries.
2561   ScalarEvolution *SE = PSE.getSE();
2562   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2563   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2564          "Invalid loop count");
2565 
2566   Type *IdxTy = Legal->getWidestInductionType();
2567   assert(IdxTy && "No type for induction");
2568 
2569   // The exit count might have the type of i64 while the phi is i32. This can
2570   // happen if we have an induction variable that is sign extended before the
2571   // compare. The only way that we get a backedge taken count is that the
2572   // induction variable was signed and as such will not overflow. In such a case
2573   // truncation is legal.
2574   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2575       IdxTy->getPrimitiveSizeInBits())
2576     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2577   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2578 
2579   // Get the total trip count from the count by adding 1.
2580   const SCEV *ExitCount = SE->getAddExpr(
2581       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2582 
2583   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2584 
2585   // Expand the trip count and place the new instructions in the preheader.
2586   // Notice that the pre-header does not change, only the loop body.
2587   SCEVExpander Exp(*SE, DL, "induction");
2588 
2589   // Count holds the overall loop count (N).
2590   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2591                                 L->getLoopPreheader()->getTerminator());
2592 
2593   if (TripCount->getType()->isPointerTy())
2594     TripCount =
2595         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2596                                     L->getLoopPreheader()->getTerminator());
2597 
2598   return TripCount;
2599 }
2600 
2601 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2602   if (VectorTripCount)
2603     return VectorTripCount;
2604 
2605   Value *TC = getOrCreateTripCount(L);
2606   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2607 
2608   Type *Ty = TC->getType();
2609   Constant *Step = ConstantInt::get(Ty, VF * UF);
2610 
2611   // If the tail is to be folded by masking, round the number of iterations N
2612   // up to a multiple of Step instead of rounding down. This is done by first
2613   // adding Step-1 and then rounding down. Note that it's ok if this addition
2614   // overflows: the vector induction variable will eventually wrap to zero given
2615   // that it starts at zero and its Step is a power of two; the loop will then
2616   // exit, with the last early-exit vector comparison also producing all-true.
2617   if (Cost->foldTailByMasking()) {
2618     assert(isPowerOf2_32(VF * UF) &&
2619            "VF*UF must be a power of 2 when folding tail by masking");
2620     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2621   }
2622 
2623   // Now we need to generate the expression for the part of the loop that the
2624   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2625   // iterations are not required for correctness, or N - Step, otherwise. Step
2626   // is equal to the vectorization factor (number of SIMD elements) times the
2627   // unroll factor (number of SIMD instructions).
2628   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2629 
2630   // If there is a non-reversed interleaved group that may speculatively access
2631   // memory out-of-bounds, we need to ensure that there will be at least one
2632   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2633   // the trip count, we set the remainder to be equal to the step. If the step
2634   // does not evenly divide the trip count, no adjustment is necessary since
2635   // there will already be scalar iterations. Note that the minimum iterations
2636   // check ensures that N >= Step.
2637   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2638     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2639     R = Builder.CreateSelect(IsZero, Step, R);
2640   }
2641 
2642   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2643 
2644   return VectorTripCount;
2645 }
2646 
2647 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2648                                                    const DataLayout &DL) {
2649   // Verify that V is a vector type with same number of elements as DstVTy.
2650   unsigned VF = DstVTy->getNumElements();
2651   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2652   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2653   Type *SrcElemTy = SrcVecTy->getElementType();
2654   Type *DstElemTy = DstVTy->getElementType();
2655   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2656          "Vector elements must have same size");
2657 
2658   // Do a direct cast if element types are castable.
2659   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2660     return Builder.CreateBitOrPointerCast(V, DstVTy);
2661   }
2662   // V cannot be directly casted to desired vector type.
2663   // May happen when V is a floating point vector but DstVTy is a vector of
2664   // pointers or vice-versa. Handle this using a two-step bitcast using an
2665   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2666   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2667          "Only one type should be a pointer type");
2668   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2669          "Only one type should be a floating point type");
2670   Type *IntTy =
2671       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2672   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2673   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2674   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2675 }
2676 
2677 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2678                                                          BasicBlock *Bypass) {
2679   Value *Count = getOrCreateTripCount(L);
2680   BasicBlock *BB = L->getLoopPreheader();
2681   IRBuilder<> Builder(BB->getTerminator());
2682 
2683   // Generate code to check if the loop's trip count is less than VF * UF, or
2684   // equal to it in case a scalar epilogue is required; this implies that the
2685   // vector trip count is zero. This check also covers the case where adding one
2686   // to the backedge-taken count overflowed leading to an incorrect trip count
2687   // of zero. In this case we will also jump to the scalar loop.
2688   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2689                                           : ICmpInst::ICMP_ULT;
2690 
2691   // If tail is to be folded, vector loop takes care of all iterations.
2692   Value *CheckMinIters = Builder.getFalse();
2693   if (!Cost->foldTailByMasking())
2694     CheckMinIters = Builder.CreateICmp(
2695         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2696         "min.iters.check");
2697 
2698   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2699   // Update dominator tree immediately if the generated block is a
2700   // LoopBypassBlock because SCEV expansions to generate loop bypass
2701   // checks may query it before the current function is finished.
2702   DT->addNewBlock(NewBB, BB);
2703   if (L->getParentLoop())
2704     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2705   ReplaceInstWithInst(BB->getTerminator(),
2706                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2707   LoopBypassBlocks.push_back(BB);
2708 }
2709 
2710 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2711   BasicBlock *BB = L->getLoopPreheader();
2712 
2713   // Generate the code to check that the SCEV assumptions that we made.
2714   // We want the new basic block to start at the first instruction in a
2715   // sequence of instructions that form a check.
2716   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2717                    "scev.check");
2718   Value *SCEVCheck =
2719       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2720 
2721   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2722     if (C->isZero())
2723       return;
2724 
2725   assert(!BB->getParent()->hasOptSize() &&
2726          "Cannot SCEV check stride or overflow when optimizing for size");
2727 
2728   // Create a new block containing the stride check.
2729   BB->setName("vector.scevcheck");
2730   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2731   // Update dominator tree immediately if the generated block is a
2732   // LoopBypassBlock because SCEV expansions to generate loop bypass
2733   // checks may query it before the current function is finished.
2734   DT->addNewBlock(NewBB, BB);
2735   if (L->getParentLoop())
2736     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2737   ReplaceInstWithInst(BB->getTerminator(),
2738                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2739   LoopBypassBlocks.push_back(BB);
2740   AddedSafetyChecks = true;
2741 }
2742 
2743 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2744   // VPlan-native path does not do any analysis for runtime checks currently.
2745   if (EnableVPlanNativePath)
2746     return;
2747 
2748   BasicBlock *BB = L->getLoopPreheader();
2749 
2750   // Generate the code that checks in runtime if arrays overlap. We put the
2751   // checks into a separate block to make the more common case of few elements
2752   // faster.
2753   Instruction *FirstCheckInst;
2754   Instruction *MemRuntimeCheck;
2755   std::tie(FirstCheckInst, MemRuntimeCheck) =
2756       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2757   if (!MemRuntimeCheck)
2758     return;
2759 
2760   if (BB->getParent()->hasOptSize()) {
2761     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2762            "Cannot emit memory checks when optimizing for size, unless forced "
2763            "to vectorize.");
2764     ORE->emit([&]() {
2765       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2766                                         L->getStartLoc(), L->getHeader())
2767              << "Code-size may be reduced by not forcing "
2768                 "vectorization, or by source-code modifications "
2769                 "eliminating the need for runtime checks "
2770                 "(e.g., adding 'restrict').";
2771     });
2772   }
2773 
2774   // Create a new block containing the memory check.
2775   BB->setName("vector.memcheck");
2776   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2777   // Update dominator tree immediately if the generated block is a
2778   // LoopBypassBlock because SCEV expansions to generate loop bypass
2779   // checks may query it before the current function is finished.
2780   DT->addNewBlock(NewBB, BB);
2781   if (L->getParentLoop())
2782     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2783   ReplaceInstWithInst(BB->getTerminator(),
2784                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2785   LoopBypassBlocks.push_back(BB);
2786   AddedSafetyChecks = true;
2787 
2788   // We currently don't use LoopVersioning for the actual loop cloning but we
2789   // still use it to add the noalias metadata.
2790   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2791                                            PSE.getSE());
2792   LVer->prepareNoAliasMetadata();
2793 }
2794 
2795 Value *InnerLoopVectorizer::emitTransformedIndex(
2796     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2797     const InductionDescriptor &ID) const {
2798 
2799   SCEVExpander Exp(*SE, DL, "induction");
2800   auto Step = ID.getStep();
2801   auto StartValue = ID.getStartValue();
2802   assert(Index->getType() == Step->getType() &&
2803          "Index type does not match StepValue type");
2804 
2805   // Note: the IR at this point is broken. We cannot use SE to create any new
2806   // SCEV and then expand it, hoping that SCEV's simplification will give us
2807   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2808   // lead to various SCEV crashes. So all we can do is to use builder and rely
2809   // on InstCombine for future simplifications. Here we handle some trivial
2810   // cases only.
2811   auto CreateAdd = [&B](Value *X, Value *Y) {
2812     assert(X->getType() == Y->getType() && "Types don't match!");
2813     if (auto *CX = dyn_cast<ConstantInt>(X))
2814       if (CX->isZero())
2815         return Y;
2816     if (auto *CY = dyn_cast<ConstantInt>(Y))
2817       if (CY->isZero())
2818         return X;
2819     return B.CreateAdd(X, Y);
2820   };
2821 
2822   auto CreateMul = [&B](Value *X, Value *Y) {
2823     assert(X->getType() == Y->getType() && "Types don't match!");
2824     if (auto *CX = dyn_cast<ConstantInt>(X))
2825       if (CX->isOne())
2826         return Y;
2827     if (auto *CY = dyn_cast<ConstantInt>(Y))
2828       if (CY->isOne())
2829         return X;
2830     return B.CreateMul(X, Y);
2831   };
2832 
2833   switch (ID.getKind()) {
2834   case InductionDescriptor::IK_IntInduction: {
2835     assert(Index->getType() == StartValue->getType() &&
2836            "Index type does not match StartValue type");
2837     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2838       return B.CreateSub(StartValue, Index);
2839     auto *Offset = CreateMul(
2840         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2841     return CreateAdd(StartValue, Offset);
2842   }
2843   case InductionDescriptor::IK_PtrInduction: {
2844     assert(isa<SCEVConstant>(Step) &&
2845            "Expected constant step for pointer induction");
2846     return B.CreateGEP(
2847         StartValue->getType()->getPointerElementType(), StartValue,
2848         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2849                                            &*B.GetInsertPoint())));
2850   }
2851   case InductionDescriptor::IK_FpInduction: {
2852     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2853     auto InductionBinOp = ID.getInductionBinOp();
2854     assert(InductionBinOp &&
2855            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2856             InductionBinOp->getOpcode() == Instruction::FSub) &&
2857            "Original bin op should be defined for FP induction");
2858 
2859     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2860 
2861     // Floating point operations had to be 'fast' to enable the induction.
2862     FastMathFlags Flags;
2863     Flags.setFast();
2864 
2865     Value *MulExp = B.CreateFMul(StepValue, Index);
2866     if (isa<Instruction>(MulExp))
2867       // We have to check, the MulExp may be a constant.
2868       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2869 
2870     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2871                                "induction");
2872     if (isa<Instruction>(BOp))
2873       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2874 
2875     return BOp;
2876   }
2877   case InductionDescriptor::IK_NoInduction:
2878     return nullptr;
2879   }
2880   llvm_unreachable("invalid enum");
2881 }
2882 
2883 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2884   /*
2885    In this function we generate a new loop. The new loop will contain
2886    the vectorized instructions while the old loop will continue to run the
2887    scalar remainder.
2888 
2889        [ ] <-- loop iteration number check.
2890     /   |
2891    /    v
2892   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2893   |  /  |
2894   | /   v
2895   ||   [ ]     <-- vector pre header.
2896   |/    |
2897   |     v
2898   |    [  ] \
2899   |    [  ]_|   <-- vector loop.
2900   |     |
2901   |     v
2902   |   -[ ]   <--- middle-block.
2903   |  /  |
2904   | /   v
2905   -|- >[ ]     <--- new preheader.
2906    |    |
2907    |    v
2908    |   [ ] \
2909    |   [ ]_|   <-- old scalar loop to handle remainder.
2910     \   |
2911      \  v
2912       >[ ]     <-- exit block.
2913    ...
2914    */
2915 
2916   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2917   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2918   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2919   MDNode *OrigLoopID = OrigLoop->getLoopID();
2920   assert(VectorPH && "Invalid loop structure");
2921   assert(ExitBlock && "Must have an exit block");
2922 
2923   // Some loops have a single integer induction variable, while other loops
2924   // don't. One example is c++ iterators that often have multiple pointer
2925   // induction variables. In the code below we also support a case where we
2926   // don't have a single induction variable.
2927   //
2928   // We try to obtain an induction variable from the original loop as hard
2929   // as possible. However if we don't find one that:
2930   //   - is an integer
2931   //   - counts from zero, stepping by one
2932   //   - is the size of the widest induction variable type
2933   // then we create a new one.
2934   OldInduction = Legal->getPrimaryInduction();
2935   Type *IdxTy = Legal->getWidestInductionType();
2936 
2937   // Split the single block loop into the two loop structure described above.
2938   BasicBlock *VecBody =
2939       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2940   BasicBlock *MiddleBlock =
2941       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2942   BasicBlock *ScalarPH =
2943       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2944 
2945   // Create and register the new vector loop.
2946   Loop *Lp = LI->AllocateLoop();
2947   Loop *ParentLoop = OrigLoop->getParentLoop();
2948 
2949   // Insert the new loop into the loop nest and register the new basic blocks
2950   // before calling any utilities such as SCEV that require valid LoopInfo.
2951   if (ParentLoop) {
2952     ParentLoop->addChildLoop(Lp);
2953     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2954     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2955   } else {
2956     LI->addTopLevelLoop(Lp);
2957   }
2958   Lp->addBasicBlockToLoop(VecBody, *LI);
2959 
2960   // Find the loop boundaries.
2961   Value *Count = getOrCreateTripCount(Lp);
2962 
2963   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2964 
2965   // Now, compare the new count to zero. If it is zero skip the vector loop and
2966   // jump to the scalar loop. This check also covers the case where the
2967   // backedge-taken count is uint##_max: adding one to it will overflow leading
2968   // to an incorrect trip count of zero. In this (rare) case we will also jump
2969   // to the scalar loop.
2970   emitMinimumIterationCountCheck(Lp, ScalarPH);
2971 
2972   // Generate the code to check any assumptions that we've made for SCEV
2973   // expressions.
2974   emitSCEVChecks(Lp, ScalarPH);
2975 
2976   // Generate the code that checks in runtime if arrays overlap. We put the
2977   // checks into a separate block to make the more common case of few elements
2978   // faster.
2979   emitMemRuntimeChecks(Lp, ScalarPH);
2980 
2981   // Generate the induction variable.
2982   // The loop step is equal to the vectorization factor (num of SIMD elements)
2983   // times the unroll factor (num of SIMD instructions).
2984   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2985   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2986   Induction =
2987       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2988                               getDebugLocFromInstOrOperands(OldInduction));
2989 
2990   // We are going to resume the execution of the scalar loop.
2991   // Go over all of the induction variables that we found and fix the
2992   // PHIs that are left in the scalar version of the loop.
2993   // The starting values of PHI nodes depend on the counter of the last
2994   // iteration in the vectorized loop.
2995   // If we come from a bypass edge then we need to start from the original
2996   // start value.
2997 
2998   // This variable saves the new starting index for the scalar loop. It is used
2999   // to test if there are any tail iterations left once the vector loop has
3000   // completed.
3001   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3002   for (auto &InductionEntry : *List) {
3003     PHINode *OrigPhi = InductionEntry.first;
3004     InductionDescriptor II = InductionEntry.second;
3005 
3006     // Create phi nodes to merge from the  backedge-taken check block.
3007     PHINode *BCResumeVal = PHINode::Create(
3008         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3009     // Copy original phi DL over to the new one.
3010     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3011     Value *&EndValue = IVEndValues[OrigPhi];
3012     if (OrigPhi == OldInduction) {
3013       // We know what the end value is.
3014       EndValue = CountRoundDown;
3015     } else {
3016       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3017       Type *StepType = II.getStep()->getType();
3018       Instruction::CastOps CastOp =
3019         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3020       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3021       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3022       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3023       EndValue->setName("ind.end");
3024     }
3025 
3026     // The new PHI merges the original incoming value, in case of a bypass,
3027     // or the value at the end of the vectorized loop.
3028     BCResumeVal->addIncoming(EndValue, MiddleBlock);
3029 
3030     // Fix the scalar body counter (PHI node).
3031     // The old induction's phi node in the scalar body needs the truncated
3032     // value.
3033     for (BasicBlock *BB : LoopBypassBlocks)
3034       BCResumeVal->addIncoming(II.getStartValue(), BB);
3035     OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
3036   }
3037 
3038   // We need the OrigLoop (scalar loop part) latch terminator to help
3039   // produce correct debug info for the middle block BB instructions.
3040   // The legality check stage guarantees that the loop will have a single
3041   // latch.
3042   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3043          "Scalar loop latch terminator isn't a branch");
3044   BranchInst *ScalarLatchBr =
3045       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3046 
3047   // Add a check in the middle block to see if we have completed
3048   // all of the iterations in the first vector loop.
3049   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3050   // If tail is to be folded, we know we don't need to run the remainder.
3051   Value *CmpN = Builder.getTrue();
3052   if (!Cost->foldTailByMasking()) {
3053     CmpN =
3054         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3055                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3056 
3057     // Here we use the same DebugLoc as the scalar loop latch branch instead
3058     // of the corresponding compare because they may have ended up with
3059     // different line numbers and we want to avoid awkward line stepping while
3060     // debugging. Eg. if the compare has got a line number inside the loop.
3061     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3062   }
3063 
3064   BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3065   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3066   ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3067 
3068   // Get ready to start creating new instructions into the vectorized body.
3069   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3070 
3071   // Save the state.
3072   LoopVectorPreHeader = Lp->getLoopPreheader();
3073   LoopScalarPreHeader = ScalarPH;
3074   LoopMiddleBlock = MiddleBlock;
3075   LoopExitBlock = ExitBlock;
3076   LoopVectorBody = VecBody;
3077   LoopScalarBody = OldBasicBlock;
3078 
3079   Optional<MDNode *> VectorizedLoopID =
3080       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3081                                       LLVMLoopVectorizeFollowupVectorized});
3082   if (VectorizedLoopID.hasValue()) {
3083     Lp->setLoopID(VectorizedLoopID.getValue());
3084 
3085     // Do not setAlreadyVectorized if loop attributes have been defined
3086     // explicitly.
3087     return LoopVectorPreHeader;
3088   }
3089 
3090   // Keep all loop hints from the original loop on the vector loop (we'll
3091   // replace the vectorizer-specific hints below).
3092   if (MDNode *LID = OrigLoop->getLoopID())
3093     Lp->setLoopID(LID);
3094 
3095   LoopVectorizeHints Hints(Lp, true, *ORE);
3096   Hints.setAlreadyVectorized();
3097 
3098   return LoopVectorPreHeader;
3099 }
3100 
3101 // Fix up external users of the induction variable. At this point, we are
3102 // in LCSSA form, with all external PHIs that use the IV having one input value,
3103 // coming from the remainder loop. We need those PHIs to also have a correct
3104 // value for the IV when arriving directly from the middle block.
3105 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3106                                        const InductionDescriptor &II,
3107                                        Value *CountRoundDown, Value *EndValue,
3108                                        BasicBlock *MiddleBlock) {
3109   // There are two kinds of external IV usages - those that use the value
3110   // computed in the last iteration (the PHI) and those that use the penultimate
3111   // value (the value that feeds into the phi from the loop latch).
3112   // We allow both, but they, obviously, have different values.
3113 
3114   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3115 
3116   DenseMap<Value *, Value *> MissingVals;
3117 
3118   // An external user of the last iteration's value should see the value that
3119   // the remainder loop uses to initialize its own IV.
3120   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3121   for (User *U : PostInc->users()) {
3122     Instruction *UI = cast<Instruction>(U);
3123     if (!OrigLoop->contains(UI)) {
3124       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3125       MissingVals[UI] = EndValue;
3126     }
3127   }
3128 
3129   // An external user of the penultimate value need to see EndValue - Step.
3130   // The simplest way to get this is to recompute it from the constituent SCEVs,
3131   // that is Start + (Step * (CRD - 1)).
3132   for (User *U : OrigPhi->users()) {
3133     auto *UI = cast<Instruction>(U);
3134     if (!OrigLoop->contains(UI)) {
3135       const DataLayout &DL =
3136           OrigLoop->getHeader()->getModule()->getDataLayout();
3137       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3138 
3139       IRBuilder<> B(MiddleBlock->getTerminator());
3140       Value *CountMinusOne = B.CreateSub(
3141           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3142       Value *CMO =
3143           !II.getStep()->getType()->isIntegerTy()
3144               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3145                              II.getStep()->getType())
3146               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3147       CMO->setName("cast.cmo");
3148       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3149       Escape->setName("ind.escape");
3150       MissingVals[UI] = Escape;
3151     }
3152   }
3153 
3154   for (auto &I : MissingVals) {
3155     PHINode *PHI = cast<PHINode>(I.first);
3156     // One corner case we have to handle is two IVs "chasing" each-other,
3157     // that is %IV2 = phi [...], [ %IV1, %latch ]
3158     // In this case, if IV1 has an external use, we need to avoid adding both
3159     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3160     // don't already have an incoming value for the middle block.
3161     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3162       PHI->addIncoming(I.second, MiddleBlock);
3163   }
3164 }
3165 
3166 namespace {
3167 
3168 struct CSEDenseMapInfo {
3169   static bool canHandle(const Instruction *I) {
3170     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3171            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3172   }
3173 
3174   static inline Instruction *getEmptyKey() {
3175     return DenseMapInfo<Instruction *>::getEmptyKey();
3176   }
3177 
3178   static inline Instruction *getTombstoneKey() {
3179     return DenseMapInfo<Instruction *>::getTombstoneKey();
3180   }
3181 
3182   static unsigned getHashValue(const Instruction *I) {
3183     assert(canHandle(I) && "Unknown instruction!");
3184     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3185                                                            I->value_op_end()));
3186   }
3187 
3188   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3189     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3190         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3191       return LHS == RHS;
3192     return LHS->isIdenticalTo(RHS);
3193   }
3194 };
3195 
3196 } // end anonymous namespace
3197 
3198 ///Perform cse of induction variable instructions.
3199 static void cse(BasicBlock *BB) {
3200   // Perform simple cse.
3201   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3202   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3203     Instruction *In = &*I++;
3204 
3205     if (!CSEDenseMapInfo::canHandle(In))
3206       continue;
3207 
3208     // Check if we can replace this instruction with any of the
3209     // visited instructions.
3210     if (Instruction *V = CSEMap.lookup(In)) {
3211       In->replaceAllUsesWith(V);
3212       In->eraseFromParent();
3213       continue;
3214     }
3215 
3216     CSEMap[In] = In;
3217   }
3218 }
3219 
3220 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3221                                                        unsigned VF,
3222                                                        bool &NeedToScalarize) {
3223   Function *F = CI->getCalledFunction();
3224   StringRef FnName = CI->getCalledFunction()->getName();
3225   Type *ScalarRetTy = CI->getType();
3226   SmallVector<Type *, 4> Tys, ScalarTys;
3227   for (auto &ArgOp : CI->arg_operands())
3228     ScalarTys.push_back(ArgOp->getType());
3229 
3230   // Estimate cost of scalarized vector call. The source operands are assumed
3231   // to be vectors, so we need to extract individual elements from there,
3232   // execute VF scalar calls, and then gather the result into the vector return
3233   // value.
3234   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3235   if (VF == 1)
3236     return ScalarCallCost;
3237 
3238   // Compute corresponding vector type for return value and arguments.
3239   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3240   for (Type *ScalarTy : ScalarTys)
3241     Tys.push_back(ToVectorTy(ScalarTy, VF));
3242 
3243   // Compute costs of unpacking argument values for the scalar calls and
3244   // packing the return values to a vector.
3245   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3246 
3247   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3248 
3249   // If we can't emit a vector call for this function, then the currently found
3250   // cost is the cost we need to return.
3251   NeedToScalarize = true;
3252   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3253     return Cost;
3254 
3255   // If the corresponding vector cost is cheaper, return its cost.
3256   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3257   if (VectorCallCost < Cost) {
3258     NeedToScalarize = false;
3259     return VectorCallCost;
3260   }
3261   return Cost;
3262 }
3263 
3264 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3265                                                             unsigned VF) {
3266   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3267   assert(ID && "Expected intrinsic call!");
3268 
3269   FastMathFlags FMF;
3270   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3271     FMF = FPMO->getFastMathFlags();
3272 
3273   SmallVector<Value *, 4> Operands(CI->arg_operands());
3274   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3275 }
3276 
3277 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3278   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3279   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3280   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3281 }
3282 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3283   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3284   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3285   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3286 }
3287 
3288 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3289   // For every instruction `I` in MinBWs, truncate the operands, create a
3290   // truncated version of `I` and reextend its result. InstCombine runs
3291   // later and will remove any ext/trunc pairs.
3292   SmallPtrSet<Value *, 4> Erased;
3293   for (const auto &KV : Cost->getMinimalBitwidths()) {
3294     // If the value wasn't vectorized, we must maintain the original scalar
3295     // type. The absence of the value from VectorLoopValueMap indicates that it
3296     // wasn't vectorized.
3297     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3298       continue;
3299     for (unsigned Part = 0; Part < UF; ++Part) {
3300       Value *I = getOrCreateVectorValue(KV.first, Part);
3301       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3302           !isa<Instruction>(I))
3303         continue;
3304       Type *OriginalTy = I->getType();
3305       Type *ScalarTruncatedTy =
3306           IntegerType::get(OriginalTy->getContext(), KV.second);
3307       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3308                                           OriginalTy->getVectorNumElements());
3309       if (TruncatedTy == OriginalTy)
3310         continue;
3311 
3312       IRBuilder<> B(cast<Instruction>(I));
3313       auto ShrinkOperand = [&](Value *V) -> Value * {
3314         if (auto *ZI = dyn_cast<ZExtInst>(V))
3315           if (ZI->getSrcTy() == TruncatedTy)
3316             return ZI->getOperand(0);
3317         return B.CreateZExtOrTrunc(V, TruncatedTy);
3318       };
3319 
3320       // The actual instruction modification depends on the instruction type,
3321       // unfortunately.
3322       Value *NewI = nullptr;
3323       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3324         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3325                              ShrinkOperand(BO->getOperand(1)));
3326 
3327         // Any wrapping introduced by shrinking this operation shouldn't be
3328         // considered undefined behavior. So, we can't unconditionally copy
3329         // arithmetic wrapping flags to NewI.
3330         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3331       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3332         NewI =
3333             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3334                          ShrinkOperand(CI->getOperand(1)));
3335       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3336         NewI = B.CreateSelect(SI->getCondition(),
3337                               ShrinkOperand(SI->getTrueValue()),
3338                               ShrinkOperand(SI->getFalseValue()));
3339       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3340         switch (CI->getOpcode()) {
3341         default:
3342           llvm_unreachable("Unhandled cast!");
3343         case Instruction::Trunc:
3344           NewI = ShrinkOperand(CI->getOperand(0));
3345           break;
3346         case Instruction::SExt:
3347           NewI = B.CreateSExtOrTrunc(
3348               CI->getOperand(0),
3349               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3350           break;
3351         case Instruction::ZExt:
3352           NewI = B.CreateZExtOrTrunc(
3353               CI->getOperand(0),
3354               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3355           break;
3356         }
3357       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3358         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3359         auto *O0 = B.CreateZExtOrTrunc(
3360             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3361         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3362         auto *O1 = B.CreateZExtOrTrunc(
3363             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3364 
3365         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3366       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3367         // Don't do anything with the operands, just extend the result.
3368         continue;
3369       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3370         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3371         auto *O0 = B.CreateZExtOrTrunc(
3372             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3373         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3374         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3375       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3376         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3377         auto *O0 = B.CreateZExtOrTrunc(
3378             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3379         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3380       } else {
3381         // If we don't know what to do, be conservative and don't do anything.
3382         continue;
3383       }
3384 
3385       // Lastly, extend the result.
3386       NewI->takeName(cast<Instruction>(I));
3387       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3388       I->replaceAllUsesWith(Res);
3389       cast<Instruction>(I)->eraseFromParent();
3390       Erased.insert(I);
3391       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3392     }
3393   }
3394 
3395   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3396   for (const auto &KV : Cost->getMinimalBitwidths()) {
3397     // If the value wasn't vectorized, we must maintain the original scalar
3398     // type. The absence of the value from VectorLoopValueMap indicates that it
3399     // wasn't vectorized.
3400     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3401       continue;
3402     for (unsigned Part = 0; Part < UF; ++Part) {
3403       Value *I = getOrCreateVectorValue(KV.first, Part);
3404       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3405       if (Inst && Inst->use_empty()) {
3406         Value *NewI = Inst->getOperand(0);
3407         Inst->eraseFromParent();
3408         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3409       }
3410     }
3411   }
3412 }
3413 
3414 void InnerLoopVectorizer::fixVectorizedLoop() {
3415   // Insert truncates and extends for any truncated instructions as hints to
3416   // InstCombine.
3417   if (VF > 1)
3418     truncateToMinimalBitwidths();
3419 
3420   // Fix widened non-induction PHIs by setting up the PHI operands.
3421   if (OrigPHIsToFix.size()) {
3422     assert(EnableVPlanNativePath &&
3423            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3424     fixNonInductionPHIs();
3425   }
3426 
3427   // At this point every instruction in the original loop is widened to a
3428   // vector form. Now we need to fix the recurrences in the loop. These PHI
3429   // nodes are currently empty because we did not want to introduce cycles.
3430   // This is the second stage of vectorizing recurrences.
3431   fixCrossIterationPHIs();
3432 
3433   // Update the dominator tree.
3434   //
3435   // FIXME: After creating the structure of the new loop, the dominator tree is
3436   //        no longer up-to-date, and it remains that way until we update it
3437   //        here. An out-of-date dominator tree is problematic for SCEV,
3438   //        because SCEVExpander uses it to guide code generation. The
3439   //        vectorizer use SCEVExpanders in several places. Instead, we should
3440   //        keep the dominator tree up-to-date as we go.
3441   updateAnalysis();
3442 
3443   // Fix-up external users of the induction variables.
3444   for (auto &Entry : *Legal->getInductionVars())
3445     fixupIVUsers(Entry.first, Entry.second,
3446                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3447                  IVEndValues[Entry.first], LoopMiddleBlock);
3448 
3449   fixLCSSAPHIs();
3450   for (Instruction *PI : PredicatedInstructions)
3451     sinkScalarOperands(&*PI);
3452 
3453   // Remove redundant induction instructions.
3454   cse(LoopVectorBody);
3455 }
3456 
3457 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3458   // In order to support recurrences we need to be able to vectorize Phi nodes.
3459   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3460   // stage #2: We now need to fix the recurrences by adding incoming edges to
3461   // the currently empty PHI nodes. At this point every instruction in the
3462   // original loop is widened to a vector form so we can use them to construct
3463   // the incoming edges.
3464   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3465     // Handle first-order recurrences and reductions that need to be fixed.
3466     if (Legal->isFirstOrderRecurrence(&Phi))
3467       fixFirstOrderRecurrence(&Phi);
3468     else if (Legal->isReductionVariable(&Phi))
3469       fixReduction(&Phi);
3470   }
3471 }
3472 
3473 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3474   // This is the second phase of vectorizing first-order recurrences. An
3475   // overview of the transformation is described below. Suppose we have the
3476   // following loop.
3477   //
3478   //   for (int i = 0; i < n; ++i)
3479   //     b[i] = a[i] - a[i - 1];
3480   //
3481   // There is a first-order recurrence on "a". For this loop, the shorthand
3482   // scalar IR looks like:
3483   //
3484   //   scalar.ph:
3485   //     s_init = a[-1]
3486   //     br scalar.body
3487   //
3488   //   scalar.body:
3489   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3490   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3491   //     s2 = a[i]
3492   //     b[i] = s2 - s1
3493   //     br cond, scalar.body, ...
3494   //
3495   // In this example, s1 is a recurrence because it's value depends on the
3496   // previous iteration. In the first phase of vectorization, we created a
3497   // temporary value for s1. We now complete the vectorization and produce the
3498   // shorthand vector IR shown below (for VF = 4, UF = 1).
3499   //
3500   //   vector.ph:
3501   //     v_init = vector(..., ..., ..., a[-1])
3502   //     br vector.body
3503   //
3504   //   vector.body
3505   //     i = phi [0, vector.ph], [i+4, vector.body]
3506   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3507   //     v2 = a[i, i+1, i+2, i+3];
3508   //     v3 = vector(v1(3), v2(0, 1, 2))
3509   //     b[i, i+1, i+2, i+3] = v2 - v3
3510   //     br cond, vector.body, middle.block
3511   //
3512   //   middle.block:
3513   //     x = v2(3)
3514   //     br scalar.ph
3515   //
3516   //   scalar.ph:
3517   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3518   //     br scalar.body
3519   //
3520   // After execution completes the vector loop, we extract the next value of
3521   // the recurrence (x) to use as the initial value in the scalar loop.
3522 
3523   // Get the original loop preheader and single loop latch.
3524   auto *Preheader = OrigLoop->getLoopPreheader();
3525   auto *Latch = OrigLoop->getLoopLatch();
3526 
3527   // Get the initial and previous values of the scalar recurrence.
3528   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3529   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3530 
3531   // Create a vector from the initial value.
3532   auto *VectorInit = ScalarInit;
3533   if (VF > 1) {
3534     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3535     VectorInit = Builder.CreateInsertElement(
3536         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3537         Builder.getInt32(VF - 1), "vector.recur.init");
3538   }
3539 
3540   // We constructed a temporary phi node in the first phase of vectorization.
3541   // This phi node will eventually be deleted.
3542   Builder.SetInsertPoint(
3543       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3544 
3545   // Create a phi node for the new recurrence. The current value will either be
3546   // the initial value inserted into a vector or loop-varying vector value.
3547   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3548   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3549 
3550   // Get the vectorized previous value of the last part UF - 1. It appears last
3551   // among all unrolled iterations, due to the order of their construction.
3552   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3553 
3554   // Set the insertion point after the previous value if it is an instruction.
3555   // Note that the previous value may have been constant-folded so it is not
3556   // guaranteed to be an instruction in the vector loop. Also, if the previous
3557   // value is a phi node, we should insert after all the phi nodes to avoid
3558   // breaking basic block verification.
3559   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3560       isa<PHINode>(PreviousLastPart))
3561     Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3562   else
3563     Builder.SetInsertPoint(
3564         &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3565 
3566   // We will construct a vector for the recurrence by combining the values for
3567   // the current and previous iterations. This is the required shuffle mask.
3568   SmallVector<Constant *, 8> ShuffleMask(VF);
3569   ShuffleMask[0] = Builder.getInt32(VF - 1);
3570   for (unsigned I = 1; I < VF; ++I)
3571     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3572 
3573   // The vector from which to take the initial value for the current iteration
3574   // (actual or unrolled). Initially, this is the vector phi node.
3575   Value *Incoming = VecPhi;
3576 
3577   // Shuffle the current and previous vector and update the vector parts.
3578   for (unsigned Part = 0; Part < UF; ++Part) {
3579     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3580     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3581     auto *Shuffle =
3582         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3583                                              ConstantVector::get(ShuffleMask))
3584                : Incoming;
3585     PhiPart->replaceAllUsesWith(Shuffle);
3586     cast<Instruction>(PhiPart)->eraseFromParent();
3587     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3588     Incoming = PreviousPart;
3589   }
3590 
3591   // Fix the latch value of the new recurrence in the vector loop.
3592   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3593 
3594   // Extract the last vector element in the middle block. This will be the
3595   // initial value for the recurrence when jumping to the scalar loop.
3596   auto *ExtractForScalar = Incoming;
3597   if (VF > 1) {
3598     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3599     ExtractForScalar = Builder.CreateExtractElement(
3600         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3601   }
3602   // Extract the second last element in the middle block if the
3603   // Phi is used outside the loop. We need to extract the phi itself
3604   // and not the last element (the phi update in the current iteration). This
3605   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3606   // when the scalar loop is not run at all.
3607   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3608   if (VF > 1)
3609     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3610         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3611   // When loop is unrolled without vectorizing, initialize
3612   // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
3613   // `Incoming`. This is analogous to the vectorized case above: extracting the
3614   // second last element when VF > 1.
3615   else if (UF > 1)
3616     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3617 
3618   // Fix the initial value of the original recurrence in the scalar loop.
3619   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3620   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3621   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3622     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3623     Start->addIncoming(Incoming, BB);
3624   }
3625 
3626   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3627   Phi->setName("scalar.recur");
3628 
3629   // Finally, fix users of the recurrence outside the loop. The users will need
3630   // either the last value of the scalar recurrence or the last value of the
3631   // vector recurrence we extracted in the middle block. Since the loop is in
3632   // LCSSA form, we just need to find all the phi nodes for the original scalar
3633   // recurrence in the exit block, and then add an edge for the middle block.
3634   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3635     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3636       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3637     }
3638   }
3639 }
3640 
3641 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3642   Constant *Zero = Builder.getInt32(0);
3643 
3644   // Get it's reduction variable descriptor.
3645   assert(Legal->isReductionVariable(Phi) &&
3646          "Unable to find the reduction variable");
3647   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3648 
3649   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3650   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3651   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3652   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3653     RdxDesc.getMinMaxRecurrenceKind();
3654   setDebugLocFromInst(Builder, ReductionStartValue);
3655 
3656   // We need to generate a reduction vector from the incoming scalar.
3657   // To do so, we need to generate the 'identity' vector and override
3658   // one of the elements with the incoming scalar reduction. We need
3659   // to do it in the vector-loop preheader.
3660   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3661 
3662   // This is the vector-clone of the value that leaves the loop.
3663   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3664 
3665   // Find the reduction identity variable. Zero for addition, or, xor,
3666   // one for multiplication, -1 for And.
3667   Value *Identity;
3668   Value *VectorStart;
3669   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3670       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3671     // MinMax reduction have the start value as their identify.
3672     if (VF == 1) {
3673       VectorStart = Identity = ReductionStartValue;
3674     } else {
3675       VectorStart = Identity =
3676         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3677     }
3678   } else {
3679     // Handle other reduction kinds:
3680     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3681         RK, VecTy->getScalarType());
3682     if (VF == 1) {
3683       Identity = Iden;
3684       // This vector is the Identity vector where the first element is the
3685       // incoming scalar reduction.
3686       VectorStart = ReductionStartValue;
3687     } else {
3688       Identity = ConstantVector::getSplat(VF, Iden);
3689 
3690       // This vector is the Identity vector where the first element is the
3691       // incoming scalar reduction.
3692       VectorStart =
3693         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3694     }
3695   }
3696 
3697   // Fix the vector-loop phi.
3698 
3699   // Reductions do not have to start at zero. They can start with
3700   // any loop invariant values.
3701   BasicBlock *Latch = OrigLoop->getLoopLatch();
3702   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3703   for (unsigned Part = 0; Part < UF; ++Part) {
3704     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3705     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3706     // Make sure to add the reduction stat value only to the
3707     // first unroll part.
3708     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3709     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3710     cast<PHINode>(VecRdxPhi)
3711       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3712   }
3713 
3714   // Before each round, move the insertion point right between
3715   // the PHIs and the values we are going to write.
3716   // This allows us to write both PHINodes and the extractelement
3717   // instructions.
3718   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3719 
3720   setDebugLocFromInst(Builder, LoopExitInst);
3721 
3722   // If tail is folded by masking, the vector value to leave the loop should be
3723   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3724   // instead of the former.
3725   if (Cost->foldTailByMasking()) {
3726     for (unsigned Part = 0; Part < UF; ++Part) {
3727       Value *VecLoopExitInst =
3728           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3729       Value *Sel = nullptr;
3730       for (User *U : VecLoopExitInst->users()) {
3731         if (isa<SelectInst>(U)) {
3732           assert(!Sel && "Reduction exit feeding two selects");
3733           Sel = U;
3734         } else
3735           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3736       }
3737       assert(Sel && "Reduction exit feeds no select");
3738       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3739     }
3740   }
3741 
3742   // If the vector reduction can be performed in a smaller type, we truncate
3743   // then extend the loop exit value to enable InstCombine to evaluate the
3744   // entire expression in the smaller type.
3745   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3746     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3747     Builder.SetInsertPoint(
3748         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3749     VectorParts RdxParts(UF);
3750     for (unsigned Part = 0; Part < UF; ++Part) {
3751       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3752       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3753       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3754                                         : Builder.CreateZExt(Trunc, VecTy);
3755       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3756            UI != RdxParts[Part]->user_end();)
3757         if (*UI != Trunc) {
3758           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3759           RdxParts[Part] = Extnd;
3760         } else {
3761           ++UI;
3762         }
3763     }
3764     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3765     for (unsigned Part = 0; Part < UF; ++Part) {
3766       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3767       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3768     }
3769   }
3770 
3771   // Reduce all of the unrolled parts into a single vector.
3772   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3773   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3774 
3775   // The middle block terminator has already been assigned a DebugLoc here (the
3776   // OrigLoop's single latch terminator). We want the whole middle block to
3777   // appear to execute on this line because: (a) it is all compiler generated,
3778   // (b) these instructions are always executed after evaluating the latch
3779   // conditional branch, and (c) other passes may add new predecessors which
3780   // terminate on this line. This is the easiest way to ensure we don't
3781   // accidentally cause an extra step back into the loop while debugging.
3782   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3783   for (unsigned Part = 1; Part < UF; ++Part) {
3784     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3785     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3786       // Floating point operations had to be 'fast' to enable the reduction.
3787       ReducedPartRdx = addFastMathFlag(
3788           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3789                               ReducedPartRdx, "bin.rdx"),
3790           RdxDesc.getFastMathFlags());
3791     else
3792       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3793                                       RdxPart);
3794   }
3795 
3796   if (VF > 1) {
3797     bool NoNaN = Legal->hasFunNoNaNAttr();
3798     ReducedPartRdx =
3799         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3800     // If the reduction can be performed in a smaller type, we need to extend
3801     // the reduction to the wider type before we branch to the original loop.
3802     if (Phi->getType() != RdxDesc.getRecurrenceType())
3803       ReducedPartRdx =
3804         RdxDesc.isSigned()
3805         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3806         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3807   }
3808 
3809   // Create a phi node that merges control-flow from the backedge-taken check
3810   // block and the middle block.
3811   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3812                                         LoopScalarPreHeader->getTerminator());
3813   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3814     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3815   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3816 
3817   // Now, we need to fix the users of the reduction variable
3818   // inside and outside of the scalar remainder loop.
3819   // We know that the loop is in LCSSA form. We need to update the
3820   // PHI nodes in the exit blocks.
3821   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3822     // All PHINodes need to have a single entry edge, or two if
3823     // we already fixed them.
3824     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3825 
3826     // We found a reduction value exit-PHI. Update it with the
3827     // incoming bypass edge.
3828     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3829       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3830   } // end of the LCSSA phi scan.
3831 
3832     // Fix the scalar loop reduction variable with the incoming reduction sum
3833     // from the vector body and from the backedge value.
3834   int IncomingEdgeBlockIdx =
3835     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3836   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3837   // Pick the other block.
3838   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3839   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3840   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3841 }
3842 
3843 void InnerLoopVectorizer::fixLCSSAPHIs() {
3844   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3845     if (LCSSAPhi.getNumIncomingValues() == 1) {
3846       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3847       // Non-instruction incoming values will have only one value.
3848       unsigned LastLane = 0;
3849       if (isa<Instruction>(IncomingValue))
3850           LastLane = Cost->isUniformAfterVectorization(
3851                          cast<Instruction>(IncomingValue), VF)
3852                          ? 0
3853                          : VF - 1;
3854       // Can be a loop invariant incoming value or the last scalar value to be
3855       // extracted from the vectorized loop.
3856       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3857       Value *lastIncomingValue =
3858           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3859       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3860     }
3861   }
3862 }
3863 
3864 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3865   // The basic block and loop containing the predicated instruction.
3866   auto *PredBB = PredInst->getParent();
3867   auto *VectorLoop = LI->getLoopFor(PredBB);
3868 
3869   // Initialize a worklist with the operands of the predicated instruction.
3870   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3871 
3872   // Holds instructions that we need to analyze again. An instruction may be
3873   // reanalyzed if we don't yet know if we can sink it or not.
3874   SmallVector<Instruction *, 8> InstsToReanalyze;
3875 
3876   // Returns true if a given use occurs in the predicated block. Phi nodes use
3877   // their operands in their corresponding predecessor blocks.
3878   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3879     auto *I = cast<Instruction>(U.getUser());
3880     BasicBlock *BB = I->getParent();
3881     if (auto *Phi = dyn_cast<PHINode>(I))
3882       BB = Phi->getIncomingBlock(
3883           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3884     return BB == PredBB;
3885   };
3886 
3887   // Iteratively sink the scalarized operands of the predicated instruction
3888   // into the block we created for it. When an instruction is sunk, it's
3889   // operands are then added to the worklist. The algorithm ends after one pass
3890   // through the worklist doesn't sink a single instruction.
3891   bool Changed;
3892   do {
3893     // Add the instructions that need to be reanalyzed to the worklist, and
3894     // reset the changed indicator.
3895     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3896     InstsToReanalyze.clear();
3897     Changed = false;
3898 
3899     while (!Worklist.empty()) {
3900       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3901 
3902       // We can't sink an instruction if it is a phi node, is already in the
3903       // predicated block, is not in the loop, or may have side effects.
3904       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3905           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3906         continue;
3907 
3908       // It's legal to sink the instruction if all its uses occur in the
3909       // predicated block. Otherwise, there's nothing to do yet, and we may
3910       // need to reanalyze the instruction.
3911       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3912         InstsToReanalyze.push_back(I);
3913         continue;
3914       }
3915 
3916       // Move the instruction to the beginning of the predicated block, and add
3917       // it's operands to the worklist.
3918       I->moveBefore(&*PredBB->getFirstInsertionPt());
3919       Worklist.insert(I->op_begin(), I->op_end());
3920 
3921       // The sinking may have enabled other instructions to be sunk, so we will
3922       // need to iterate.
3923       Changed = true;
3924     }
3925   } while (Changed);
3926 }
3927 
3928 void InnerLoopVectorizer::fixNonInductionPHIs() {
3929   for (PHINode *OrigPhi : OrigPHIsToFix) {
3930     PHINode *NewPhi =
3931         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3932     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3933 
3934     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3935         predecessors(OrigPhi->getParent()));
3936     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3937         predecessors(NewPhi->getParent()));
3938     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3939            "Scalar and Vector BB should have the same number of predecessors");
3940 
3941     // The insertion point in Builder may be invalidated by the time we get
3942     // here. Force the Builder insertion point to something valid so that we do
3943     // not run into issues during insertion point restore in
3944     // getOrCreateVectorValue calls below.
3945     Builder.SetInsertPoint(NewPhi);
3946 
3947     // The predecessor order is preserved and we can rely on mapping between
3948     // scalar and vector block predecessors.
3949     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3950       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3951 
3952       // When looking up the new scalar/vector values to fix up, use incoming
3953       // values from original phi.
3954       Value *ScIncV =
3955           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3956 
3957       // Scalar incoming value may need a broadcast
3958       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3959       NewPhi->addIncoming(NewIncV, NewPredBB);
3960     }
3961   }
3962 }
3963 
3964 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3965                                               unsigned VF) {
3966   PHINode *P = cast<PHINode>(PN);
3967   if (EnableVPlanNativePath) {
3968     // Currently we enter here in the VPlan-native path for non-induction
3969     // PHIs where all control flow is uniform. We simply widen these PHIs.
3970     // Create a vector phi with no operands - the vector phi operands will be
3971     // set at the end of vector code generation.
3972     Type *VecTy =
3973         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3974     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3975     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3976     OrigPHIsToFix.push_back(P);
3977 
3978     return;
3979   }
3980 
3981   assert(PN->getParent() == OrigLoop->getHeader() &&
3982          "Non-header phis should have been handled elsewhere");
3983 
3984   // In order to support recurrences we need to be able to vectorize Phi nodes.
3985   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3986   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3987   // this value when we vectorize all of the instructions that use the PHI.
3988   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3989     for (unsigned Part = 0; Part < UF; ++Part) {
3990       // This is phase one of vectorizing PHIs.
3991       Type *VecTy =
3992           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3993       Value *EntryPart = PHINode::Create(
3994           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3995       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3996     }
3997     return;
3998   }
3999 
4000   setDebugLocFromInst(Builder, P);
4001 
4002   // This PHINode must be an induction variable.
4003   // Make sure that we know about it.
4004   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4005 
4006   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4007   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4008 
4009   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4010   // which can be found from the original scalar operations.
4011   switch (II.getKind()) {
4012   case InductionDescriptor::IK_NoInduction:
4013     llvm_unreachable("Unknown induction");
4014   case InductionDescriptor::IK_IntInduction:
4015   case InductionDescriptor::IK_FpInduction:
4016     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4017   case InductionDescriptor::IK_PtrInduction: {
4018     // Handle the pointer induction variable case.
4019     assert(P->getType()->isPointerTy() && "Unexpected type.");
4020     // This is the normalized GEP that starts counting at zero.
4021     Value *PtrInd = Induction;
4022     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4023     // Determine the number of scalars we need to generate for each unroll
4024     // iteration. If the instruction is uniform, we only need to generate the
4025     // first lane. Otherwise, we generate all VF values.
4026     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4027     // These are the scalar results. Notice that we don't generate vector GEPs
4028     // because scalar GEPs result in better code.
4029     for (unsigned Part = 0; Part < UF; ++Part) {
4030       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4031         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4032         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4033         Value *SclrGep =
4034             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4035         SclrGep->setName("next.gep");
4036         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4037       }
4038     }
4039     return;
4040   }
4041   }
4042 }
4043 
4044 /// A helper function for checking whether an integer division-related
4045 /// instruction may divide by zero (in which case it must be predicated if
4046 /// executed conditionally in the scalar code).
4047 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4048 /// Non-zero divisors that are non compile-time constants will not be
4049 /// converted into multiplication, so we will still end up scalarizing
4050 /// the division, but can do so w/o predication.
4051 static bool mayDivideByZero(Instruction &I) {
4052   assert((I.getOpcode() == Instruction::UDiv ||
4053           I.getOpcode() == Instruction::SDiv ||
4054           I.getOpcode() == Instruction::URem ||
4055           I.getOpcode() == Instruction::SRem) &&
4056          "Unexpected instruction");
4057   Value *Divisor = I.getOperand(1);
4058   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4059   return !CInt || CInt->isZero();
4060 }
4061 
4062 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4063   switch (I.getOpcode()) {
4064   case Instruction::Br:
4065   case Instruction::PHI:
4066     llvm_unreachable("This instruction is handled by a different recipe.");
4067   case Instruction::GetElementPtr: {
4068     // Construct a vector GEP by widening the operands of the scalar GEP as
4069     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4070     // results in a vector of pointers when at least one operand of the GEP
4071     // is vector-typed. Thus, to keep the representation compact, we only use
4072     // vector-typed operands for loop-varying values.
4073     auto *GEP = cast<GetElementPtrInst>(&I);
4074 
4075     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4076       // If we are vectorizing, but the GEP has only loop-invariant operands,
4077       // the GEP we build (by only using vector-typed operands for
4078       // loop-varying values) would be a scalar pointer. Thus, to ensure we
4079       // produce a vector of pointers, we need to either arbitrarily pick an
4080       // operand to broadcast, or broadcast a clone of the original GEP.
4081       // Here, we broadcast a clone of the original.
4082       //
4083       // TODO: If at some point we decide to scalarize instructions having
4084       //       loop-invariant operands, this special case will no longer be
4085       //       required. We would add the scalarization decision to
4086       //       collectLoopScalars() and teach getVectorValue() to broadcast
4087       //       the lane-zero scalar value.
4088       auto *Clone = Builder.Insert(GEP->clone());
4089       for (unsigned Part = 0; Part < UF; ++Part) {
4090         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4091         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4092         addMetadata(EntryPart, GEP);
4093       }
4094     } else {
4095       // If the GEP has at least one loop-varying operand, we are sure to
4096       // produce a vector of pointers. But if we are only unrolling, we want
4097       // to produce a scalar GEP for each unroll part. Thus, the GEP we
4098       // produce with the code below will be scalar (if VF == 1) or vector
4099       // (otherwise). Note that for the unroll-only case, we still maintain
4100       // values in the vector mapping with initVector, as we do for other
4101       // instructions.
4102       for (unsigned Part = 0; Part < UF; ++Part) {
4103         // The pointer operand of the new GEP. If it's loop-invariant, we
4104         // won't broadcast it.
4105         auto *Ptr =
4106             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4107                 ? GEP->getPointerOperand()
4108                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4109 
4110         // Collect all the indices for the new GEP. If any index is
4111         // loop-invariant, we won't broadcast it.
4112         SmallVector<Value *, 4> Indices;
4113         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4114           if (OrigLoop->isLoopInvariant(U.get()))
4115             Indices.push_back(U.get());
4116           else
4117             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4118         }
4119 
4120         // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4121         // but it should be a vector, otherwise.
4122         auto *NewGEP =
4123             GEP->isInBounds()
4124                 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4125                                             Indices)
4126                 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4127         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4128                "NewGEP is not a pointer vector");
4129         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4130         addMetadata(NewGEP, GEP);
4131       }
4132     }
4133 
4134     break;
4135   }
4136   case Instruction::UDiv:
4137   case Instruction::SDiv:
4138   case Instruction::SRem:
4139   case Instruction::URem:
4140   case Instruction::Add:
4141   case Instruction::FAdd:
4142   case Instruction::Sub:
4143   case Instruction::FSub:
4144   case Instruction::FNeg:
4145   case Instruction::Mul:
4146   case Instruction::FMul:
4147   case Instruction::FDiv:
4148   case Instruction::FRem:
4149   case Instruction::Shl:
4150   case Instruction::LShr:
4151   case Instruction::AShr:
4152   case Instruction::And:
4153   case Instruction::Or:
4154   case Instruction::Xor: {
4155     // Just widen unops and binops.
4156     setDebugLocFromInst(Builder, &I);
4157 
4158     for (unsigned Part = 0; Part < UF; ++Part) {
4159       SmallVector<Value *, 2> Ops;
4160       for (Value *Op : I.operands())
4161         Ops.push_back(getOrCreateVectorValue(Op, Part));
4162 
4163       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4164 
4165       if (auto *VecOp = dyn_cast<Instruction>(V))
4166         VecOp->copyIRFlags(&I);
4167 
4168       // Use this vector value for all users of the original instruction.
4169       VectorLoopValueMap.setVectorValue(&I, Part, V);
4170       addMetadata(V, &I);
4171     }
4172 
4173     break;
4174   }
4175   case Instruction::Select: {
4176     // Widen selects.
4177     // If the selector is loop invariant we can create a select
4178     // instruction with a scalar condition. Otherwise, use vector-select.
4179     auto *SE = PSE.getSE();
4180     bool InvariantCond =
4181         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4182     setDebugLocFromInst(Builder, &I);
4183 
4184     // The condition can be loop invariant  but still defined inside the
4185     // loop. This means that we can't just use the original 'cond' value.
4186     // We have to take the 'vectorized' value and pick the first lane.
4187     // Instcombine will make this a no-op.
4188 
4189     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4190 
4191     for (unsigned Part = 0; Part < UF; ++Part) {
4192       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4193       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4194       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4195       Value *Sel =
4196           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4197       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4198       addMetadata(Sel, &I);
4199     }
4200 
4201     break;
4202   }
4203 
4204   case Instruction::ICmp:
4205   case Instruction::FCmp: {
4206     // Widen compares. Generate vector compares.
4207     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4208     auto *Cmp = cast<CmpInst>(&I);
4209     setDebugLocFromInst(Builder, Cmp);
4210     for (unsigned Part = 0; Part < UF; ++Part) {
4211       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4212       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4213       Value *C = nullptr;
4214       if (FCmp) {
4215         // Propagate fast math flags.
4216         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4217         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4218         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4219       } else {
4220         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4221       }
4222       VectorLoopValueMap.setVectorValue(&I, Part, C);
4223       addMetadata(C, &I);
4224     }
4225 
4226     break;
4227   }
4228 
4229   case Instruction::ZExt:
4230   case Instruction::SExt:
4231   case Instruction::FPToUI:
4232   case Instruction::FPToSI:
4233   case Instruction::FPExt:
4234   case Instruction::PtrToInt:
4235   case Instruction::IntToPtr:
4236   case Instruction::SIToFP:
4237   case Instruction::UIToFP:
4238   case Instruction::Trunc:
4239   case Instruction::FPTrunc:
4240   case Instruction::BitCast: {
4241     auto *CI = cast<CastInst>(&I);
4242     setDebugLocFromInst(Builder, CI);
4243 
4244     /// Vectorize casts.
4245     Type *DestTy =
4246         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4247 
4248     for (unsigned Part = 0; Part < UF; ++Part) {
4249       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4250       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4251       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4252       addMetadata(Cast, &I);
4253     }
4254     break;
4255   }
4256 
4257   case Instruction::Call: {
4258     // Ignore dbg intrinsics.
4259     if (isa<DbgInfoIntrinsic>(I))
4260       break;
4261     setDebugLocFromInst(Builder, &I);
4262 
4263     Module *M = I.getParent()->getParent()->getParent();
4264     auto *CI = cast<CallInst>(&I);
4265 
4266     StringRef FnName = CI->getCalledFunction()->getName();
4267     Function *F = CI->getCalledFunction();
4268     Type *RetTy = ToVectorTy(CI->getType(), VF);
4269     SmallVector<Type *, 4> Tys;
4270     for (Value *ArgOperand : CI->arg_operands())
4271       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4272 
4273     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4274 
4275     // The flag shows whether we use Intrinsic or a usual Call for vectorized
4276     // version of the instruction.
4277     // Is it beneficial to perform intrinsic call compared to lib call?
4278     bool NeedToScalarize;
4279     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4280     bool UseVectorIntrinsic =
4281         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4282     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4283            "Instruction should be scalarized elsewhere.");
4284 
4285     for (unsigned Part = 0; Part < UF; ++Part) {
4286       SmallVector<Value *, 4> Args;
4287       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4288         Value *Arg = CI->getArgOperand(i);
4289         // Some intrinsics have a scalar argument - don't replace it with a
4290         // vector.
4291         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4292           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4293         Args.push_back(Arg);
4294       }
4295 
4296       Function *VectorF;
4297       if (UseVectorIntrinsic) {
4298         // Use vector version of the intrinsic.
4299         Type *TysForDecl[] = {CI->getType()};
4300         if (VF > 1)
4301           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4302         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4303       } else {
4304         // Use vector version of the library call.
4305         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4306         assert(!VFnName.empty() && "Vector function name is empty.");
4307         VectorF = M->getFunction(VFnName);
4308         if (!VectorF) {
4309           // Generate a declaration
4310           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4311           VectorF =
4312               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4313           VectorF->copyAttributesFrom(F);
4314         }
4315       }
4316       assert(VectorF && "Can't create vector function.");
4317 
4318       SmallVector<OperandBundleDef, 1> OpBundles;
4319       CI->getOperandBundlesAsDefs(OpBundles);
4320       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4321 
4322       if (isa<FPMathOperator>(V))
4323         V->copyFastMathFlags(CI);
4324 
4325       VectorLoopValueMap.setVectorValue(&I, Part, V);
4326       addMetadata(V, &I);
4327     }
4328 
4329     break;
4330   }
4331 
4332   default:
4333     // This instruction is not vectorized by simple widening.
4334     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4335     llvm_unreachable("Unhandled instruction!");
4336   } // end of switch.
4337 }
4338 
4339 void InnerLoopVectorizer::updateAnalysis() {
4340   // Forget the original basic block.
4341   PSE.getSE()->forgetLoop(OrigLoop);
4342 
4343   // DT is not kept up-to-date for outer loop vectorization
4344   if (EnableVPlanNativePath)
4345     return;
4346 
4347   // Update the dominator tree information.
4348   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4349          "Entry does not dominate exit.");
4350 
4351   DT->addNewBlock(LoopMiddleBlock,
4352                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4353   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4354   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4355   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4356   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4357 }
4358 
4359 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4360   // We should not collect Scalars more than once per VF. Right now, this
4361   // function is called from collectUniformsAndScalars(), which already does
4362   // this check. Collecting Scalars for VF=1 does not make any sense.
4363   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4364          "This function should not be visited twice for the same VF");
4365 
4366   SmallSetVector<Instruction *, 8> Worklist;
4367 
4368   // These sets are used to seed the analysis with pointers used by memory
4369   // accesses that will remain scalar.
4370   SmallSetVector<Instruction *, 8> ScalarPtrs;
4371   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4372 
4373   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4374   // The pointer operands of loads and stores will be scalar as long as the
4375   // memory access is not a gather or scatter operation. The value operand of a
4376   // store will remain scalar if the store is scalarized.
4377   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4378     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4379     assert(WideningDecision != CM_Unknown &&
4380            "Widening decision should be ready at this moment");
4381     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4382       if (Ptr == Store->getValueOperand())
4383         return WideningDecision == CM_Scalarize;
4384     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4385            "Ptr is neither a value or pointer operand");
4386     return WideningDecision != CM_GatherScatter;
4387   };
4388 
4389   // A helper that returns true if the given value is a bitcast or
4390   // getelementptr instruction contained in the loop.
4391   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4392     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4393             isa<GetElementPtrInst>(V)) &&
4394            !TheLoop->isLoopInvariant(V);
4395   };
4396 
4397   // A helper that evaluates a memory access's use of a pointer. If the use
4398   // will be a scalar use, and the pointer is only used by memory accesses, we
4399   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4400   // PossibleNonScalarPtrs.
4401   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4402     // We only care about bitcast and getelementptr instructions contained in
4403     // the loop.
4404     if (!isLoopVaryingBitCastOrGEP(Ptr))
4405       return;
4406 
4407     // If the pointer has already been identified as scalar (e.g., if it was
4408     // also identified as uniform), there's nothing to do.
4409     auto *I = cast<Instruction>(Ptr);
4410     if (Worklist.count(I))
4411       return;
4412 
4413     // If the use of the pointer will be a scalar use, and all users of the
4414     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4415     // place the pointer in PossibleNonScalarPtrs.
4416     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4417           return isa<LoadInst>(U) || isa<StoreInst>(U);
4418         }))
4419       ScalarPtrs.insert(I);
4420     else
4421       PossibleNonScalarPtrs.insert(I);
4422   };
4423 
4424   // We seed the scalars analysis with three classes of instructions: (1)
4425   // instructions marked uniform-after-vectorization, (2) bitcast and
4426   // getelementptr instructions used by memory accesses requiring a scalar use,
4427   // and (3) pointer induction variables and their update instructions (we
4428   // currently only scalarize these).
4429   //
4430   // (1) Add to the worklist all instructions that have been identified as
4431   // uniform-after-vectorization.
4432   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4433 
4434   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4435   // memory accesses requiring a scalar use. The pointer operands of loads and
4436   // stores will be scalar as long as the memory accesses is not a gather or
4437   // scatter operation. The value operand of a store will remain scalar if the
4438   // store is scalarized.
4439   for (auto *BB : TheLoop->blocks())
4440     for (auto &I : *BB) {
4441       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4442         evaluatePtrUse(Load, Load->getPointerOperand());
4443       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4444         evaluatePtrUse(Store, Store->getPointerOperand());
4445         evaluatePtrUse(Store, Store->getValueOperand());
4446       }
4447     }
4448   for (auto *I : ScalarPtrs)
4449     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4450       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4451       Worklist.insert(I);
4452     }
4453 
4454   // (3) Add to the worklist all pointer induction variables and their update
4455   // instructions.
4456   //
4457   // TODO: Once we are able to vectorize pointer induction variables we should
4458   //       no longer insert them into the worklist here.
4459   auto *Latch = TheLoop->getLoopLatch();
4460   for (auto &Induction : *Legal->getInductionVars()) {
4461     auto *Ind = Induction.first;
4462     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4463     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4464       continue;
4465     Worklist.insert(Ind);
4466     Worklist.insert(IndUpdate);
4467     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4468     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4469                       << "\n");
4470   }
4471 
4472   // Insert the forced scalars.
4473   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4474   // induction variable when the PHI user is scalarized.
4475   auto ForcedScalar = ForcedScalars.find(VF);
4476   if (ForcedScalar != ForcedScalars.end())
4477     for (auto *I : ForcedScalar->second)
4478       Worklist.insert(I);
4479 
4480   // Expand the worklist by looking through any bitcasts and getelementptr
4481   // instructions we've already identified as scalar. This is similar to the
4482   // expansion step in collectLoopUniforms(); however, here we're only
4483   // expanding to include additional bitcasts and getelementptr instructions.
4484   unsigned Idx = 0;
4485   while (Idx != Worklist.size()) {
4486     Instruction *Dst = Worklist[Idx++];
4487     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4488       continue;
4489     auto *Src = cast<Instruction>(Dst->getOperand(0));
4490     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4491           auto *J = cast<Instruction>(U);
4492           return !TheLoop->contains(J) || Worklist.count(J) ||
4493                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4494                   isScalarUse(J, Src));
4495         })) {
4496       Worklist.insert(Src);
4497       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4498     }
4499   }
4500 
4501   // An induction variable will remain scalar if all users of the induction
4502   // variable and induction variable update remain scalar.
4503   for (auto &Induction : *Legal->getInductionVars()) {
4504     auto *Ind = Induction.first;
4505     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4506 
4507     // We already considered pointer induction variables, so there's no reason
4508     // to look at their users again.
4509     //
4510     // TODO: Once we are able to vectorize pointer induction variables we
4511     //       should no longer skip over them here.
4512     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4513       continue;
4514 
4515     // Determine if all users of the induction variable are scalar after
4516     // vectorization.
4517     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4518       auto *I = cast<Instruction>(U);
4519       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4520     });
4521     if (!ScalarInd)
4522       continue;
4523 
4524     // Determine if all users of the induction variable update instruction are
4525     // scalar after vectorization.
4526     auto ScalarIndUpdate =
4527         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4528           auto *I = cast<Instruction>(U);
4529           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4530         });
4531     if (!ScalarIndUpdate)
4532       continue;
4533 
4534     // The induction variable and its update instruction will remain scalar.
4535     Worklist.insert(Ind);
4536     Worklist.insert(IndUpdate);
4537     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4538     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4539                       << "\n");
4540   }
4541 
4542   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4543 }
4544 
4545 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4546   if (!blockNeedsPredication(I->getParent()))
4547     return false;
4548   switch(I->getOpcode()) {
4549   default:
4550     break;
4551   case Instruction::Load:
4552   case Instruction::Store: {
4553     if (!Legal->isMaskRequired(I))
4554       return false;
4555     auto *Ptr = getLoadStorePointerOperand(I);
4556     auto *Ty = getMemInstValueType(I);
4557     // We have already decided how to vectorize this instruction, get that
4558     // result.
4559     if (VF > 1) {
4560       InstWidening WideningDecision = getWideningDecision(I, VF);
4561       assert(WideningDecision != CM_Unknown &&
4562              "Widening decision should be ready at this moment");
4563       return WideningDecision == CM_Scalarize;
4564     }
4565     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4566     return isa<LoadInst>(I) ?
4567         !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
4568       : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
4569   }
4570   case Instruction::UDiv:
4571   case Instruction::SDiv:
4572   case Instruction::SRem:
4573   case Instruction::URem:
4574     return mayDivideByZero(*I);
4575   }
4576   return false;
4577 }
4578 
4579 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4580                                                                unsigned VF) {
4581   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4582   assert(getWideningDecision(I, VF) == CM_Unknown &&
4583          "Decision should not be set yet.");
4584   auto *Group = getInterleavedAccessGroup(I);
4585   assert(Group && "Must have a group.");
4586 
4587   // If the instruction's allocated size doesn't equal it's type size, it
4588   // requires padding and will be scalarized.
4589   auto &DL = I->getModule()->getDataLayout();
4590   auto *ScalarTy = getMemInstValueType(I);
4591   if (hasIrregularType(ScalarTy, DL, VF))
4592     return false;
4593 
4594   // Check if masking is required.
4595   // A Group may need masking for one of two reasons: it resides in a block that
4596   // needs predication, or it was decided to use masking to deal with gaps.
4597   bool PredicatedAccessRequiresMasking =
4598       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4599   bool AccessWithGapsRequiresMasking =
4600       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4601   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4602     return true;
4603 
4604   // If masked interleaving is required, we expect that the user/target had
4605   // enabled it, because otherwise it either wouldn't have been created or
4606   // it should have been invalidated by the CostModel.
4607   assert(useMaskedInterleavedAccesses(TTI) &&
4608          "Masked interleave-groups for predicated accesses are not enabled.");
4609 
4610   auto *Ty = getMemInstValueType(I);
4611   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4612   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4613                           : TTI.isLegalMaskedStore(Ty, Alignment);
4614 }
4615 
4616 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4617                                                                unsigned VF) {
4618   // Get and ensure we have a valid memory instruction.
4619   LoadInst *LI = dyn_cast<LoadInst>(I);
4620   StoreInst *SI = dyn_cast<StoreInst>(I);
4621   assert((LI || SI) && "Invalid memory instruction");
4622 
4623   auto *Ptr = getLoadStorePointerOperand(I);
4624 
4625   // In order to be widened, the pointer should be consecutive, first of all.
4626   if (!Legal->isConsecutivePtr(Ptr))
4627     return false;
4628 
4629   // If the instruction is a store located in a predicated block, it will be
4630   // scalarized.
4631   if (isScalarWithPredication(I))
4632     return false;
4633 
4634   // If the instruction's allocated size doesn't equal it's type size, it
4635   // requires padding and will be scalarized.
4636   auto &DL = I->getModule()->getDataLayout();
4637   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4638   if (hasIrregularType(ScalarTy, DL, VF))
4639     return false;
4640 
4641   return true;
4642 }
4643 
4644 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4645   // We should not collect Uniforms more than once per VF. Right now,
4646   // this function is called from collectUniformsAndScalars(), which
4647   // already does this check. Collecting Uniforms for VF=1 does not make any
4648   // sense.
4649 
4650   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4651          "This function should not be visited twice for the same VF");
4652 
4653   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4654   // not analyze again.  Uniforms.count(VF) will return 1.
4655   Uniforms[VF].clear();
4656 
4657   // We now know that the loop is vectorizable!
4658   // Collect instructions inside the loop that will remain uniform after
4659   // vectorization.
4660 
4661   // Global values, params and instructions outside of current loop are out of
4662   // scope.
4663   auto isOutOfScope = [&](Value *V) -> bool {
4664     Instruction *I = dyn_cast<Instruction>(V);
4665     return (!I || !TheLoop->contains(I));
4666   };
4667 
4668   SetVector<Instruction *> Worklist;
4669   BasicBlock *Latch = TheLoop->getLoopLatch();
4670 
4671   // Instructions that are scalar with predication must not be considered
4672   // uniform after vectorization, because that would create an erroneous
4673   // replicating region where only a single instance out of VF should be formed.
4674   // TODO: optimize such seldom cases if found important, see PR40816.
4675   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4676     if (isScalarWithPredication(I, VF)) {
4677       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4678                         << *I << "\n");
4679       return;
4680     }
4681     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4682     Worklist.insert(I);
4683   };
4684 
4685   // Start with the conditional branch. If the branch condition is an
4686   // instruction contained in the loop that is only used by the branch, it is
4687   // uniform.
4688   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4689   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4690     addToWorklistIfAllowed(Cmp);
4691 
4692   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4693   // are pointers that are treated like consecutive pointers during
4694   // vectorization. The pointer operands of interleaved accesses are an
4695   // example.
4696   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4697 
4698   // Holds pointer operands of instructions that are possibly non-uniform.
4699   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4700 
4701   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4702     InstWidening WideningDecision = getWideningDecision(I, VF);
4703     assert(WideningDecision != CM_Unknown &&
4704            "Widening decision should be ready at this moment");
4705 
4706     return (WideningDecision == CM_Widen ||
4707             WideningDecision == CM_Widen_Reverse ||
4708             WideningDecision == CM_Interleave);
4709   };
4710   // Iterate over the instructions in the loop, and collect all
4711   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4712   // that a consecutive-like pointer operand will be scalarized, we collect it
4713   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4714   // getelementptr instruction can be used by both vectorized and scalarized
4715   // memory instructions. For example, if a loop loads and stores from the same
4716   // location, but the store is conditional, the store will be scalarized, and
4717   // the getelementptr won't remain uniform.
4718   for (auto *BB : TheLoop->blocks())
4719     for (auto &I : *BB) {
4720       // If there's no pointer operand, there's nothing to do.
4721       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4722       if (!Ptr)
4723         continue;
4724 
4725       // True if all users of Ptr are memory accesses that have Ptr as their
4726       // pointer operand.
4727       auto UsersAreMemAccesses =
4728           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4729             return getLoadStorePointerOperand(U) == Ptr;
4730           });
4731 
4732       // Ensure the memory instruction will not be scalarized or used by
4733       // gather/scatter, making its pointer operand non-uniform. If the pointer
4734       // operand is used by any instruction other than a memory access, we
4735       // conservatively assume the pointer operand may be non-uniform.
4736       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4737         PossibleNonUniformPtrs.insert(Ptr);
4738 
4739       // If the memory instruction will be vectorized and its pointer operand
4740       // is consecutive-like, or interleaving - the pointer operand should
4741       // remain uniform.
4742       else
4743         ConsecutiveLikePtrs.insert(Ptr);
4744     }
4745 
4746   // Add to the Worklist all consecutive and consecutive-like pointers that
4747   // aren't also identified as possibly non-uniform.
4748   for (auto *V : ConsecutiveLikePtrs)
4749     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4750       addToWorklistIfAllowed(V);
4751 
4752   // Expand Worklist in topological order: whenever a new instruction
4753   // is added , its users should be already inside Worklist.  It ensures
4754   // a uniform instruction will only be used by uniform instructions.
4755   unsigned idx = 0;
4756   while (idx != Worklist.size()) {
4757     Instruction *I = Worklist[idx++];
4758 
4759     for (auto OV : I->operand_values()) {
4760       // isOutOfScope operands cannot be uniform instructions.
4761       if (isOutOfScope(OV))
4762         continue;
4763       // First order recurrence Phi's should typically be considered
4764       // non-uniform.
4765       auto *OP = dyn_cast<PHINode>(OV);
4766       if (OP && Legal->isFirstOrderRecurrence(OP))
4767         continue;
4768       // If all the users of the operand are uniform, then add the
4769       // operand into the uniform worklist.
4770       auto *OI = cast<Instruction>(OV);
4771       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4772             auto *J = cast<Instruction>(U);
4773             return Worklist.count(J) ||
4774                    (OI == getLoadStorePointerOperand(J) &&
4775                     isUniformDecision(J, VF));
4776           }))
4777         addToWorklistIfAllowed(OI);
4778     }
4779   }
4780 
4781   // Returns true if Ptr is the pointer operand of a memory access instruction
4782   // I, and I is known to not require scalarization.
4783   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4784     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4785   };
4786 
4787   // For an instruction to be added into Worklist above, all its users inside
4788   // the loop should also be in Worklist. However, this condition cannot be
4789   // true for phi nodes that form a cyclic dependence. We must process phi
4790   // nodes separately. An induction variable will remain uniform if all users
4791   // of the induction variable and induction variable update remain uniform.
4792   // The code below handles both pointer and non-pointer induction variables.
4793   for (auto &Induction : *Legal->getInductionVars()) {
4794     auto *Ind = Induction.first;
4795     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4796 
4797     // Determine if all users of the induction variable are uniform after
4798     // vectorization.
4799     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4800       auto *I = cast<Instruction>(U);
4801       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4802              isVectorizedMemAccessUse(I, Ind);
4803     });
4804     if (!UniformInd)
4805       continue;
4806 
4807     // Determine if all users of the induction variable update instruction are
4808     // uniform after vectorization.
4809     auto UniformIndUpdate =
4810         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4811           auto *I = cast<Instruction>(U);
4812           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4813                  isVectorizedMemAccessUse(I, IndUpdate);
4814         });
4815     if (!UniformIndUpdate)
4816       continue;
4817 
4818     // The induction variable and its update instruction will remain uniform.
4819     addToWorklistIfAllowed(Ind);
4820     addToWorklistIfAllowed(IndUpdate);
4821   }
4822 
4823   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4824 }
4825 
4826 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4827   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4828 
4829   if (Legal->getRuntimePointerChecking()->Need) {
4830     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4831         "runtime pointer checks needed. Enable vectorization of this "
4832         "loop with '#pragma clang loop vectorize(enable)' when "
4833         "compiling with -Os/-Oz",
4834         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4835     return true;
4836   }
4837 
4838   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4839     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4840         "runtime SCEV checks needed. Enable vectorization of this "
4841         "loop with '#pragma clang loop vectorize(enable)' when "
4842         "compiling with -Os/-Oz",
4843         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4844     return true;
4845   }
4846 
4847   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4848   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4849     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4850         "runtime stride == 1 checks needed. Enable vectorization of "
4851         "this loop with '#pragma clang loop vectorize(enable)' when "
4852         "compiling with -Os/-Oz",
4853         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4854     return true;
4855   }
4856 
4857   return false;
4858 }
4859 
4860 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4861   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4862     // TODO: It may by useful to do since it's still likely to be dynamically
4863     // uniform if the target can skip.
4864     reportVectorizationFailure(
4865         "Not inserting runtime ptr check for divergent target",
4866         "runtime pointer checks needed. Not enabled for divergent target",
4867         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4868     return None;
4869   }
4870 
4871   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4872   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4873   if (TC == 1) {
4874     reportVectorizationFailure("Single iteration (non) loop",
4875         "loop trip count is one, irrelevant for vectorization",
4876         "SingleIterationLoop", ORE, TheLoop);
4877     return None;
4878   }
4879 
4880   switch (ScalarEpilogueStatus) {
4881   case CM_ScalarEpilogueAllowed:
4882     return computeFeasibleMaxVF(TC);
4883   case CM_ScalarEpilogueNotNeededUsePredicate:
4884     LLVM_DEBUG(
4885         dbgs() << "LV: vector predicate hint/switch found.\n"
4886                << "LV: Not allowing scalar epilogue, creating predicated "
4887                << "vector loop.\n");
4888     break;
4889   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4890     // fallthrough as a special case of OptForSize
4891   case CM_ScalarEpilogueNotAllowedOptSize:
4892     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4893       LLVM_DEBUG(
4894           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4895     else
4896       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4897                         << "count.\n");
4898 
4899     // Bail if runtime checks are required, which are not good when optimising
4900     // for size.
4901     if (runtimeChecksRequired())
4902       return None;
4903     break;
4904   }
4905 
4906   // Now try the tail folding
4907 
4908   // Invalidate interleave groups that require an epilogue if we can't mask
4909   // the interleave-group.
4910   if (!useMaskedInterleavedAccesses(TTI))
4911     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4912 
4913   unsigned MaxVF = computeFeasibleMaxVF(TC);
4914   if (TC > 0 && TC % MaxVF == 0) {
4915     // Accept MaxVF if we do not have a tail.
4916     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4917     return MaxVF;
4918   }
4919 
4920   // If we don't know the precise trip count, or if the trip count that we
4921   // found modulo the vectorization factor is not zero, try to fold the tail
4922   // by masking.
4923   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4924   if (Legal->prepareToFoldTailByMasking()) {
4925     FoldTailByMasking = true;
4926     return MaxVF;
4927   }
4928 
4929   if (TC == 0) {
4930     reportVectorizationFailure(
4931         "Unable to calculate the loop count due to complex control flow",
4932         "unable to calculate the loop count due to complex control flow",
4933         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4934     return None;
4935   }
4936 
4937   reportVectorizationFailure(
4938       "Cannot optimize for size and vectorize at the same time.",
4939       "cannot optimize for size and vectorize at the same time. "
4940       "Enable vectorization of this loop with '#pragma clang loop "
4941       "vectorize(enable)' when compiling with -Os/-Oz",
4942       "NoTailLoopWithOptForSize", ORE, TheLoop);
4943   return None;
4944 }
4945 
4946 unsigned
4947 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4948   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4949   unsigned SmallestType, WidestType;
4950   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4951   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4952 
4953   // Get the maximum safe dependence distance in bits computed by LAA.
4954   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4955   // the memory accesses that is most restrictive (involved in the smallest
4956   // dependence distance).
4957   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4958 
4959   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4960 
4961   unsigned MaxVectorSize = WidestRegister / WidestType;
4962 
4963   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4964                     << " / " << WidestType << " bits.\n");
4965   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4966                     << WidestRegister << " bits.\n");
4967 
4968   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4969                                  " into one vector!");
4970   if (MaxVectorSize == 0) {
4971     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4972     MaxVectorSize = 1;
4973     return MaxVectorSize;
4974   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4975              isPowerOf2_32(ConstTripCount)) {
4976     // We need to clamp the VF to be the ConstTripCount. There is no point in
4977     // choosing a higher viable VF as done in the loop below.
4978     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4979                       << ConstTripCount << "\n");
4980     MaxVectorSize = ConstTripCount;
4981     return MaxVectorSize;
4982   }
4983 
4984   unsigned MaxVF = MaxVectorSize;
4985   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4986       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4987     // Collect all viable vectorization factors larger than the default MaxVF
4988     // (i.e. MaxVectorSize).
4989     SmallVector<unsigned, 8> VFs;
4990     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4991     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4992       VFs.push_back(VS);
4993 
4994     // For each VF calculate its register usage.
4995     auto RUs = calculateRegisterUsage(VFs);
4996 
4997     // Select the largest VF which doesn't require more registers than existing
4998     // ones.
4999     for (int i = RUs.size() - 1; i >= 0; --i) {
5000       bool Selected = true;
5001       for (auto& pair : RUs[i].MaxLocalUsers) {
5002         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5003         if (pair.second > TargetNumRegisters)
5004           Selected = false;
5005       }
5006       if (Selected) {
5007         MaxVF = VFs[i];
5008         break;
5009       }
5010     }
5011     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5012       if (MaxVF < MinVF) {
5013         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5014                           << ") with target's minimum: " << MinVF << '\n');
5015         MaxVF = MinVF;
5016       }
5017     }
5018   }
5019   return MaxVF;
5020 }
5021 
5022 VectorizationFactor
5023 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5024   float Cost = expectedCost(1).first;
5025   const float ScalarCost = Cost;
5026   unsigned Width = 1;
5027   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5028 
5029   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5030   if (ForceVectorization && MaxVF > 1) {
5031     // Ignore scalar width, because the user explicitly wants vectorization.
5032     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5033     // evaluation.
5034     Cost = std::numeric_limits<float>::max();
5035   }
5036 
5037   for (unsigned i = 2; i <= MaxVF; i *= 2) {
5038     // Notice that the vector loop needs to be executed less times, so
5039     // we need to divide the cost of the vector loops by the width of
5040     // the vector elements.
5041     VectorizationCostTy C = expectedCost(i);
5042     float VectorCost = C.first / (float)i;
5043     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5044                       << " costs: " << (int)VectorCost << ".\n");
5045     if (!C.second && !ForceVectorization) {
5046       LLVM_DEBUG(
5047           dbgs() << "LV: Not considering vector loop of width " << i
5048                  << " because it will not generate any vector instructions.\n");
5049       continue;
5050     }
5051     if (VectorCost < Cost) {
5052       Cost = VectorCost;
5053       Width = i;
5054     }
5055   }
5056 
5057   if (!EnableCondStoresVectorization && NumPredStores) {
5058     reportVectorizationFailure("There are conditional stores.",
5059         "store that is conditionally executed prevents vectorization",
5060         "ConditionalStore", ORE, TheLoop);
5061     Width = 1;
5062     Cost = ScalarCost;
5063   }
5064 
5065   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5066              << "LV: Vectorization seems to be not beneficial, "
5067              << "but was forced by a user.\n");
5068   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5069   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5070   return Factor;
5071 }
5072 
5073 std::pair<unsigned, unsigned>
5074 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5075   unsigned MinWidth = -1U;
5076   unsigned MaxWidth = 8;
5077   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5078 
5079   // For each block.
5080   for (BasicBlock *BB : TheLoop->blocks()) {
5081     // For each instruction in the loop.
5082     for (Instruction &I : BB->instructionsWithoutDebug()) {
5083       Type *T = I.getType();
5084 
5085       // Skip ignored values.
5086       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5087         continue;
5088 
5089       // Only examine Loads, Stores and PHINodes.
5090       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5091         continue;
5092 
5093       // Examine PHI nodes that are reduction variables. Update the type to
5094       // account for the recurrence type.
5095       if (auto *PN = dyn_cast<PHINode>(&I)) {
5096         if (!Legal->isReductionVariable(PN))
5097           continue;
5098         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5099         T = RdxDesc.getRecurrenceType();
5100       }
5101 
5102       // Examine the stored values.
5103       if (auto *ST = dyn_cast<StoreInst>(&I))
5104         T = ST->getValueOperand()->getType();
5105 
5106       // Ignore loaded pointer types and stored pointer types that are not
5107       // vectorizable.
5108       //
5109       // FIXME: The check here attempts to predict whether a load or store will
5110       //        be vectorized. We only know this for certain after a VF has
5111       //        been selected. Here, we assume that if an access can be
5112       //        vectorized, it will be. We should also look at extending this
5113       //        optimization to non-pointer types.
5114       //
5115       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5116           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5117         continue;
5118 
5119       MinWidth = std::min(MinWidth,
5120                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5121       MaxWidth = std::max(MaxWidth,
5122                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5123     }
5124   }
5125 
5126   return {MinWidth, MaxWidth};
5127 }
5128 
5129 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5130                                                            unsigned LoopCost) {
5131   // -- The interleave heuristics --
5132   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5133   // There are many micro-architectural considerations that we can't predict
5134   // at this level. For example, frontend pressure (on decode or fetch) due to
5135   // code size, or the number and capabilities of the execution ports.
5136   //
5137   // We use the following heuristics to select the interleave count:
5138   // 1. If the code has reductions, then we interleave to break the cross
5139   // iteration dependency.
5140   // 2. If the loop is really small, then we interleave to reduce the loop
5141   // overhead.
5142   // 3. We don't interleave if we think that we will spill registers to memory
5143   // due to the increased register pressure.
5144 
5145   if (!isScalarEpilogueAllowed())
5146     return 1;
5147 
5148   // We used the distance for the interleave count.
5149   if (Legal->getMaxSafeDepDistBytes() != -1U)
5150     return 1;
5151 
5152   // Do not interleave loops with a relatively small known or estimated trip
5153   // count.
5154   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5155   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5156     return 1;
5157 
5158   RegisterUsage R = calculateRegisterUsage({VF})[0];
5159   // We divide by these constants so assume that we have at least one
5160   // instruction that uses at least one register.
5161   for (auto& pair : R.MaxLocalUsers) {
5162     pair.second = std::max(pair.second, 1U);
5163   }
5164 
5165   // We calculate the interleave count using the following formula.
5166   // Subtract the number of loop invariants from the number of available
5167   // registers. These registers are used by all of the interleaved instances.
5168   // Next, divide the remaining registers by the number of registers that is
5169   // required by the loop, in order to estimate how many parallel instances
5170   // fit without causing spills. All of this is rounded down if necessary to be
5171   // a power of two. We want power of two interleave count to simplify any
5172   // addressing operations or alignment considerations.
5173   // We also want power of two interleave counts to ensure that the induction
5174   // variable of the vector loop wraps to zero, when tail is folded by masking;
5175   // this currently happens when OptForSize, in which case IC is set to 1 above.
5176   unsigned IC = UINT_MAX;
5177 
5178   for (auto& pair : R.MaxLocalUsers) {
5179     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5180     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5181                       << " registers of "
5182                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5183     if (VF == 1) {
5184       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5185         TargetNumRegisters = ForceTargetNumScalarRegs;
5186     } else {
5187       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5188         TargetNumRegisters = ForceTargetNumVectorRegs;
5189     }
5190     unsigned MaxLocalUsers = pair.second;
5191     unsigned LoopInvariantRegs = 0;
5192     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5193       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5194 
5195     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5196     // Don't count the induction variable as interleaved.
5197     if (EnableIndVarRegisterHeur) {
5198       TmpIC =
5199           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5200                         std::max(1U, (MaxLocalUsers - 1)));
5201     }
5202 
5203     IC = std::min(IC, TmpIC);
5204   }
5205 
5206   // Clamp the interleave ranges to reasonable counts.
5207   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5208 
5209   // Check if the user has overridden the max.
5210   if (VF == 1) {
5211     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5212       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5213   } else {
5214     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5215       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5216   }
5217 
5218   // If trip count is known or estimated compile time constant, limit the
5219   // interleave count to be less than the trip count divided by VF.
5220   if (BestKnownTC) {
5221     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5222   }
5223 
5224   // If we did not calculate the cost for VF (because the user selected the VF)
5225   // then we calculate the cost of VF here.
5226   if (LoopCost == 0)
5227     LoopCost = expectedCost(VF).first;
5228 
5229   assert(LoopCost && "Non-zero loop cost expected");
5230 
5231   // Clamp the calculated IC to be between the 1 and the max interleave count
5232   // that the target and trip count allows.
5233   if (IC > MaxInterleaveCount)
5234     IC = MaxInterleaveCount;
5235   else if (IC < 1)
5236     IC = 1;
5237 
5238   // Interleave if we vectorized this loop and there is a reduction that could
5239   // benefit from interleaving.
5240   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5241     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5242     return IC;
5243   }
5244 
5245   // Note that if we've already vectorized the loop we will have done the
5246   // runtime check and so interleaving won't require further checks.
5247   bool InterleavingRequiresRuntimePointerCheck =
5248       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5249 
5250   // We want to interleave small loops in order to reduce the loop overhead and
5251   // potentially expose ILP opportunities.
5252   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5253   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5254     // We assume that the cost overhead is 1 and we use the cost model
5255     // to estimate the cost of the loop and interleave until the cost of the
5256     // loop overhead is about 5% of the cost of the loop.
5257     unsigned SmallIC =
5258         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5259 
5260     // Interleave until store/load ports (estimated by max interleave count) are
5261     // saturated.
5262     unsigned NumStores = Legal->getNumStores();
5263     unsigned NumLoads = Legal->getNumLoads();
5264     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5265     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5266 
5267     // If we have a scalar reduction (vector reductions are already dealt with
5268     // by this point), we can increase the critical path length if the loop
5269     // we're interleaving is inside another loop. Limit, by default to 2, so the
5270     // critical path only gets increased by one reduction operation.
5271     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5272       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5273       SmallIC = std::min(SmallIC, F);
5274       StoresIC = std::min(StoresIC, F);
5275       LoadsIC = std::min(LoadsIC, F);
5276     }
5277 
5278     if (EnableLoadStoreRuntimeInterleave &&
5279         std::max(StoresIC, LoadsIC) > SmallIC) {
5280       LLVM_DEBUG(
5281           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5282       return std::max(StoresIC, LoadsIC);
5283     }
5284 
5285     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5286     return SmallIC;
5287   }
5288 
5289   // Interleave if this is a large loop (small loops are already dealt with by
5290   // this point) that could benefit from interleaving.
5291   bool HasReductions = !Legal->getReductionVars()->empty();
5292   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5293     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5294     return IC;
5295   }
5296 
5297   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5298   return 1;
5299 }
5300 
5301 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5302 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5303   // This function calculates the register usage by measuring the highest number
5304   // of values that are alive at a single location. Obviously, this is a very
5305   // rough estimation. We scan the loop in a topological order in order and
5306   // assign a number to each instruction. We use RPO to ensure that defs are
5307   // met before their users. We assume that each instruction that has in-loop
5308   // users starts an interval. We record every time that an in-loop value is
5309   // used, so we have a list of the first and last occurrences of each
5310   // instruction. Next, we transpose this data structure into a multi map that
5311   // holds the list of intervals that *end* at a specific location. This multi
5312   // map allows us to perform a linear search. We scan the instructions linearly
5313   // and record each time that a new interval starts, by placing it in a set.
5314   // If we find this value in the multi-map then we remove it from the set.
5315   // The max register usage is the maximum size of the set.
5316   // We also search for instructions that are defined outside the loop, but are
5317   // used inside the loop. We need this number separately from the max-interval
5318   // usage number because when we unroll, loop-invariant values do not take
5319   // more register.
5320   LoopBlocksDFS DFS(TheLoop);
5321   DFS.perform(LI);
5322 
5323   RegisterUsage RU;
5324 
5325   // Each 'key' in the map opens a new interval. The values
5326   // of the map are the index of the 'last seen' usage of the
5327   // instruction that is the key.
5328   using IntervalMap = DenseMap<Instruction *, unsigned>;
5329 
5330   // Maps instruction to its index.
5331   SmallVector<Instruction *, 64> IdxToInstr;
5332   // Marks the end of each interval.
5333   IntervalMap EndPoint;
5334   // Saves the list of instruction indices that are used in the loop.
5335   SmallPtrSet<Instruction *, 8> Ends;
5336   // Saves the list of values that are used in the loop but are
5337   // defined outside the loop, such as arguments and constants.
5338   SmallPtrSet<Value *, 8> LoopInvariants;
5339 
5340   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5341     for (Instruction &I : BB->instructionsWithoutDebug()) {
5342       IdxToInstr.push_back(&I);
5343 
5344       // Save the end location of each USE.
5345       for (Value *U : I.operands()) {
5346         auto *Instr = dyn_cast<Instruction>(U);
5347 
5348         // Ignore non-instruction values such as arguments, constants, etc.
5349         if (!Instr)
5350           continue;
5351 
5352         // If this instruction is outside the loop then record it and continue.
5353         if (!TheLoop->contains(Instr)) {
5354           LoopInvariants.insert(Instr);
5355           continue;
5356         }
5357 
5358         // Overwrite previous end points.
5359         EndPoint[Instr] = IdxToInstr.size();
5360         Ends.insert(Instr);
5361       }
5362     }
5363   }
5364 
5365   // Saves the list of intervals that end with the index in 'key'.
5366   using InstrList = SmallVector<Instruction *, 2>;
5367   DenseMap<unsigned, InstrList> TransposeEnds;
5368 
5369   // Transpose the EndPoints to a list of values that end at each index.
5370   for (auto &Interval : EndPoint)
5371     TransposeEnds[Interval.second].push_back(Interval.first);
5372 
5373   SmallPtrSet<Instruction *, 8> OpenIntervals;
5374 
5375   // Get the size of the widest register.
5376   unsigned MaxSafeDepDist = -1U;
5377   if (Legal->getMaxSafeDepDistBytes() != -1U)
5378     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5379   unsigned WidestRegister =
5380       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5381   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5382 
5383   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5384   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5385 
5386   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5387 
5388   // A lambda that gets the register usage for the given type and VF.
5389   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5390     if (Ty->isTokenTy())
5391       return 0U;
5392     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5393     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5394   };
5395 
5396   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5397     Instruction *I = IdxToInstr[i];
5398 
5399     // Remove all of the instructions that end at this location.
5400     InstrList &List = TransposeEnds[i];
5401     for (Instruction *ToRemove : List)
5402       OpenIntervals.erase(ToRemove);
5403 
5404     // Ignore instructions that are never used within the loop.
5405     if (Ends.find(I) == Ends.end())
5406       continue;
5407 
5408     // Skip ignored values.
5409     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5410       continue;
5411 
5412     // For each VF find the maximum usage of registers.
5413     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5414       // Count the number of live intervals.
5415       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5416 
5417       if (VFs[j] == 1) {
5418         for (auto Inst : OpenIntervals) {
5419           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5420           if (RegUsage.find(ClassID) == RegUsage.end())
5421             RegUsage[ClassID] = 1;
5422           else
5423             RegUsage[ClassID] += 1;
5424         }
5425       } else {
5426         collectUniformsAndScalars(VFs[j]);
5427         for (auto Inst : OpenIntervals) {
5428           // Skip ignored values for VF > 1.
5429           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5430             continue;
5431           if (isScalarAfterVectorization(Inst, VFs[j])) {
5432             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5433             if (RegUsage.find(ClassID) == RegUsage.end())
5434               RegUsage[ClassID] = 1;
5435             else
5436               RegUsage[ClassID] += 1;
5437           } else {
5438             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5439             if (RegUsage.find(ClassID) == RegUsage.end())
5440               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5441             else
5442               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5443           }
5444         }
5445       }
5446 
5447       for (auto& pair : RegUsage) {
5448         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5449           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5450         else
5451           MaxUsages[j][pair.first] = pair.second;
5452       }
5453     }
5454 
5455     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5456                       << OpenIntervals.size() << '\n');
5457 
5458     // Add the current instruction to the list of open intervals.
5459     OpenIntervals.insert(I);
5460   }
5461 
5462   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5463     SmallMapVector<unsigned, unsigned, 4> Invariant;
5464 
5465     for (auto Inst : LoopInvariants) {
5466       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5467       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5468       if (Invariant.find(ClassID) == Invariant.end())
5469         Invariant[ClassID] = Usage;
5470       else
5471         Invariant[ClassID] += Usage;
5472     }
5473 
5474     LLVM_DEBUG({
5475       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5476       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5477              << " item\n";
5478       for (const auto &pair : MaxUsages[i]) {
5479         dbgs() << "LV(REG): RegisterClass: "
5480                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5481                << " registers\n";
5482       }
5483       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5484              << " item\n";
5485       for (const auto &pair : Invariant) {
5486         dbgs() << "LV(REG): RegisterClass: "
5487                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5488                << " registers\n";
5489       }
5490     });
5491 
5492     RU.LoopInvariantRegs = Invariant;
5493     RU.MaxLocalUsers = MaxUsages[i];
5494     RUs[i] = RU;
5495   }
5496 
5497   return RUs;
5498 }
5499 
5500 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5501   // TODO: Cost model for emulated masked load/store is completely
5502   // broken. This hack guides the cost model to use an artificially
5503   // high enough value to practically disable vectorization with such
5504   // operations, except where previously deployed legality hack allowed
5505   // using very low cost values. This is to avoid regressions coming simply
5506   // from moving "masked load/store" check from legality to cost model.
5507   // Masked Load/Gather emulation was previously never allowed.
5508   // Limited number of Masked Store/Scatter emulation was allowed.
5509   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5510   return isa<LoadInst>(I) ||
5511          (isa<StoreInst>(I) &&
5512           NumPredStores > NumberOfStoresToPredicate);
5513 }
5514 
5515 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5516   // If we aren't vectorizing the loop, or if we've already collected the
5517   // instructions to scalarize, there's nothing to do. Collection may already
5518   // have occurred if we have a user-selected VF and are now computing the
5519   // expected cost for interleaving.
5520   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5521     return;
5522 
5523   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5524   // not profitable to scalarize any instructions, the presence of VF in the
5525   // map will indicate that we've analyzed it already.
5526   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5527 
5528   // Find all the instructions that are scalar with predication in the loop and
5529   // determine if it would be better to not if-convert the blocks they are in.
5530   // If so, we also record the instructions to scalarize.
5531   for (BasicBlock *BB : TheLoop->blocks()) {
5532     if (!blockNeedsPredication(BB))
5533       continue;
5534     for (Instruction &I : *BB)
5535       if (isScalarWithPredication(&I)) {
5536         ScalarCostsTy ScalarCosts;
5537         // Do not apply discount logic if hacked cost is needed
5538         // for emulated masked memrefs.
5539         if (!useEmulatedMaskMemRefHack(&I) &&
5540             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5541           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5542         // Remember that BB will remain after vectorization.
5543         PredicatedBBsAfterVectorization.insert(BB);
5544       }
5545   }
5546 }
5547 
5548 int LoopVectorizationCostModel::computePredInstDiscount(
5549     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5550     unsigned VF) {
5551   assert(!isUniformAfterVectorization(PredInst, VF) &&
5552          "Instruction marked uniform-after-vectorization will be predicated");
5553 
5554   // Initialize the discount to zero, meaning that the scalar version and the
5555   // vector version cost the same.
5556   int Discount = 0;
5557 
5558   // Holds instructions to analyze. The instructions we visit are mapped in
5559   // ScalarCosts. Those instructions are the ones that would be scalarized if
5560   // we find that the scalar version costs less.
5561   SmallVector<Instruction *, 8> Worklist;
5562 
5563   // Returns true if the given instruction can be scalarized.
5564   auto canBeScalarized = [&](Instruction *I) -> bool {
5565     // We only attempt to scalarize instructions forming a single-use chain
5566     // from the original predicated block that would otherwise be vectorized.
5567     // Although not strictly necessary, we give up on instructions we know will
5568     // already be scalar to avoid traversing chains that are unlikely to be
5569     // beneficial.
5570     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5571         isScalarAfterVectorization(I, VF))
5572       return false;
5573 
5574     // If the instruction is scalar with predication, it will be analyzed
5575     // separately. We ignore it within the context of PredInst.
5576     if (isScalarWithPredication(I))
5577       return false;
5578 
5579     // If any of the instruction's operands are uniform after vectorization,
5580     // the instruction cannot be scalarized. This prevents, for example, a
5581     // masked load from being scalarized.
5582     //
5583     // We assume we will only emit a value for lane zero of an instruction
5584     // marked uniform after vectorization, rather than VF identical values.
5585     // Thus, if we scalarize an instruction that uses a uniform, we would
5586     // create uses of values corresponding to the lanes we aren't emitting code
5587     // for. This behavior can be changed by allowing getScalarValue to clone
5588     // the lane zero values for uniforms rather than asserting.
5589     for (Use &U : I->operands())
5590       if (auto *J = dyn_cast<Instruction>(U.get()))
5591         if (isUniformAfterVectorization(J, VF))
5592           return false;
5593 
5594     // Otherwise, we can scalarize the instruction.
5595     return true;
5596   };
5597 
5598   // Compute the expected cost discount from scalarizing the entire expression
5599   // feeding the predicated instruction. We currently only consider expressions
5600   // that are single-use instruction chains.
5601   Worklist.push_back(PredInst);
5602   while (!Worklist.empty()) {
5603     Instruction *I = Worklist.pop_back_val();
5604 
5605     // If we've already analyzed the instruction, there's nothing to do.
5606     if (ScalarCosts.find(I) != ScalarCosts.end())
5607       continue;
5608 
5609     // Compute the cost of the vector instruction. Note that this cost already
5610     // includes the scalarization overhead of the predicated instruction.
5611     unsigned VectorCost = getInstructionCost(I, VF).first;
5612 
5613     // Compute the cost of the scalarized instruction. This cost is the cost of
5614     // the instruction as if it wasn't if-converted and instead remained in the
5615     // predicated block. We will scale this cost by block probability after
5616     // computing the scalarization overhead.
5617     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5618 
5619     // Compute the scalarization overhead of needed insertelement instructions
5620     // and phi nodes.
5621     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5622       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5623                                                  true, false);
5624       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5625     }
5626 
5627     // Compute the scalarization overhead of needed extractelement
5628     // instructions. For each of the instruction's operands, if the operand can
5629     // be scalarized, add it to the worklist; otherwise, account for the
5630     // overhead.
5631     for (Use &U : I->operands())
5632       if (auto *J = dyn_cast<Instruction>(U.get())) {
5633         assert(VectorType::isValidElementType(J->getType()) &&
5634                "Instruction has non-scalar type");
5635         if (canBeScalarized(J))
5636           Worklist.push_back(J);
5637         else if (needsExtract(J, VF))
5638           ScalarCost += TTI.getScalarizationOverhead(
5639                               ToVectorTy(J->getType(),VF), false, true);
5640       }
5641 
5642     // Scale the total scalar cost by block probability.
5643     ScalarCost /= getReciprocalPredBlockProb();
5644 
5645     // Compute the discount. A non-negative discount means the vector version
5646     // of the instruction costs more, and scalarizing would be beneficial.
5647     Discount += VectorCost - ScalarCost;
5648     ScalarCosts[I] = ScalarCost;
5649   }
5650 
5651   return Discount;
5652 }
5653 
5654 LoopVectorizationCostModel::VectorizationCostTy
5655 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5656   VectorizationCostTy Cost;
5657 
5658   // For each block.
5659   for (BasicBlock *BB : TheLoop->blocks()) {
5660     VectorizationCostTy BlockCost;
5661 
5662     // For each instruction in the old loop.
5663     for (Instruction &I : BB->instructionsWithoutDebug()) {
5664       // Skip ignored values.
5665       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5666           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5667         continue;
5668 
5669       VectorizationCostTy C = getInstructionCost(&I, VF);
5670 
5671       // Check if we should override the cost.
5672       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5673         C.first = ForceTargetInstructionCost;
5674 
5675       BlockCost.first += C.first;
5676       BlockCost.second |= C.second;
5677       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5678                         << " for VF " << VF << " For instruction: " << I
5679                         << '\n');
5680     }
5681 
5682     // If we are vectorizing a predicated block, it will have been
5683     // if-converted. This means that the block's instructions (aside from
5684     // stores and instructions that may divide by zero) will now be
5685     // unconditionally executed. For the scalar case, we may not always execute
5686     // the predicated block. Thus, scale the block's cost by the probability of
5687     // executing it.
5688     if (VF == 1 && blockNeedsPredication(BB))
5689       BlockCost.first /= getReciprocalPredBlockProb();
5690 
5691     Cost.first += BlockCost.first;
5692     Cost.second |= BlockCost.second;
5693   }
5694 
5695   return Cost;
5696 }
5697 
5698 /// Gets Address Access SCEV after verifying that the access pattern
5699 /// is loop invariant except the induction variable dependence.
5700 ///
5701 /// This SCEV can be sent to the Target in order to estimate the address
5702 /// calculation cost.
5703 static const SCEV *getAddressAccessSCEV(
5704               Value *Ptr,
5705               LoopVectorizationLegality *Legal,
5706               PredicatedScalarEvolution &PSE,
5707               const Loop *TheLoop) {
5708 
5709   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5710   if (!Gep)
5711     return nullptr;
5712 
5713   // We are looking for a gep with all loop invariant indices except for one
5714   // which should be an induction variable.
5715   auto SE = PSE.getSE();
5716   unsigned NumOperands = Gep->getNumOperands();
5717   for (unsigned i = 1; i < NumOperands; ++i) {
5718     Value *Opd = Gep->getOperand(i);
5719     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5720         !Legal->isInductionVariable(Opd))
5721       return nullptr;
5722   }
5723 
5724   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5725   return PSE.getSCEV(Ptr);
5726 }
5727 
5728 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5729   return Legal->hasStride(I->getOperand(0)) ||
5730          Legal->hasStride(I->getOperand(1));
5731 }
5732 
5733 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5734                                                                  unsigned VF) {
5735   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5736   Type *ValTy = getMemInstValueType(I);
5737   auto SE = PSE.getSE();
5738 
5739   unsigned AS = getLoadStoreAddressSpace(I);
5740   Value *Ptr = getLoadStorePointerOperand(I);
5741   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5742 
5743   // Figure out whether the access is strided and get the stride value
5744   // if it's known in compile time
5745   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5746 
5747   // Get the cost of the scalar memory instruction and address computation.
5748   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5749 
5750   // Don't pass *I here, since it is scalar but will actually be part of a
5751   // vectorized loop where the user of it is a vectorized instruction.
5752   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5753   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5754                                    Alignment, AS);
5755 
5756   // Get the overhead of the extractelement and insertelement instructions
5757   // we might create due to scalarization.
5758   Cost += getScalarizationOverhead(I, VF);
5759 
5760   // If we have a predicated store, it may not be executed for each vector
5761   // lane. Scale the cost by the probability of executing the predicated
5762   // block.
5763   if (isPredicatedInst(I)) {
5764     Cost /= getReciprocalPredBlockProb();
5765 
5766     if (useEmulatedMaskMemRefHack(I))
5767       // Artificially setting to a high enough value to practically disable
5768       // vectorization with such operations.
5769       Cost = 3000000;
5770   }
5771 
5772   return Cost;
5773 }
5774 
5775 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5776                                                              unsigned VF) {
5777   Type *ValTy = getMemInstValueType(I);
5778   Type *VectorTy = ToVectorTy(ValTy, VF);
5779   Value *Ptr = getLoadStorePointerOperand(I);
5780   unsigned AS = getLoadStoreAddressSpace(I);
5781   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5782 
5783   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5784          "Stride should be 1 or -1 for consecutive memory access");
5785   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5786   unsigned Cost = 0;
5787   if (Legal->isMaskRequired(I))
5788     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5789                                       Alignment ? Alignment->value() : 0, AS);
5790   else
5791     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5792 
5793   bool Reverse = ConsecutiveStride < 0;
5794   if (Reverse)
5795     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5796   return Cost;
5797 }
5798 
5799 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5800                                                          unsigned VF) {
5801   Type *ValTy = getMemInstValueType(I);
5802   Type *VectorTy = ToVectorTy(ValTy, VF);
5803   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5804   unsigned AS = getLoadStoreAddressSpace(I);
5805   if (isa<LoadInst>(I)) {
5806     return TTI.getAddressComputationCost(ValTy) +
5807            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5808            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5809   }
5810   StoreInst *SI = cast<StoreInst>(I);
5811 
5812   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5813   return TTI.getAddressComputationCost(ValTy) +
5814          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5815          (isLoopInvariantStoreValue
5816               ? 0
5817               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5818                                        VF - 1));
5819 }
5820 
5821 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5822                                                           unsigned VF) {
5823   Type *ValTy = getMemInstValueType(I);
5824   Type *VectorTy = ToVectorTy(ValTy, VF);
5825   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5826   Value *Ptr = getLoadStorePointerOperand(I);
5827 
5828   return TTI.getAddressComputationCost(VectorTy) +
5829          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5830                                     Legal->isMaskRequired(I),
5831                                     Alignment ? Alignment->value() : 0);
5832 }
5833 
5834 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5835                                                             unsigned VF) {
5836   Type *ValTy = getMemInstValueType(I);
5837   Type *VectorTy = ToVectorTy(ValTy, VF);
5838   unsigned AS = getLoadStoreAddressSpace(I);
5839 
5840   auto Group = getInterleavedAccessGroup(I);
5841   assert(Group && "Fail to get an interleaved access group.");
5842 
5843   unsigned InterleaveFactor = Group->getFactor();
5844   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5845 
5846   // Holds the indices of existing members in an interleaved load group.
5847   // An interleaved store group doesn't need this as it doesn't allow gaps.
5848   SmallVector<unsigned, 4> Indices;
5849   if (isa<LoadInst>(I)) {
5850     for (unsigned i = 0; i < InterleaveFactor; i++)
5851       if (Group->getMember(i))
5852         Indices.push_back(i);
5853   }
5854 
5855   // Calculate the cost of the whole interleaved group.
5856   bool UseMaskForGaps =
5857       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5858   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5859       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5860       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5861 
5862   if (Group->isReverse()) {
5863     // TODO: Add support for reversed masked interleaved access.
5864     assert(!Legal->isMaskRequired(I) &&
5865            "Reverse masked interleaved access not supported.");
5866     Cost += Group->getNumMembers() *
5867             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5868   }
5869   return Cost;
5870 }
5871 
5872 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5873                                                               unsigned VF) {
5874   // Calculate scalar cost only. Vectorization cost should be ready at this
5875   // moment.
5876   if (VF == 1) {
5877     Type *ValTy = getMemInstValueType(I);
5878     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5879     unsigned AS = getLoadStoreAddressSpace(I);
5880 
5881     return TTI.getAddressComputationCost(ValTy) +
5882            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5883   }
5884   return getWideningCost(I, VF);
5885 }
5886 
5887 LoopVectorizationCostModel::VectorizationCostTy
5888 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5889   // If we know that this instruction will remain uniform, check the cost of
5890   // the scalar version.
5891   if (isUniformAfterVectorization(I, VF))
5892     VF = 1;
5893 
5894   if (VF > 1 && isProfitableToScalarize(I, VF))
5895     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5896 
5897   // Forced scalars do not have any scalarization overhead.
5898   auto ForcedScalar = ForcedScalars.find(VF);
5899   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5900     auto InstSet = ForcedScalar->second;
5901     if (InstSet.find(I) != InstSet.end())
5902       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5903   }
5904 
5905   Type *VectorTy;
5906   unsigned C = getInstructionCost(I, VF, VectorTy);
5907 
5908   bool TypeNotScalarized =
5909       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5910   return VectorizationCostTy(C, TypeNotScalarized);
5911 }
5912 
5913 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5914                                                               unsigned VF) {
5915 
5916   if (VF == 1)
5917     return 0;
5918 
5919   unsigned Cost = 0;
5920   Type *RetTy = ToVectorTy(I->getType(), VF);
5921   if (!RetTy->isVoidTy() &&
5922       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5923     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5924 
5925   // Some targets keep addresses scalar.
5926   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5927     return Cost;
5928 
5929   // Some targets support efficient element stores.
5930   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5931     return Cost;
5932 
5933   // Collect operands to consider.
5934   CallInst *CI = dyn_cast<CallInst>(I);
5935   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5936 
5937   // Skip operands that do not require extraction/scalarization and do not incur
5938   // any overhead.
5939   return Cost + TTI.getOperandsScalarizationOverhead(
5940                     filterExtractingOperands(Ops, VF), VF);
5941 }
5942 
5943 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5944   if (VF == 1)
5945     return;
5946   NumPredStores = 0;
5947   for (BasicBlock *BB : TheLoop->blocks()) {
5948     // For each instruction in the old loop.
5949     for (Instruction &I : *BB) {
5950       Value *Ptr =  getLoadStorePointerOperand(&I);
5951       if (!Ptr)
5952         continue;
5953 
5954       // TODO: We should generate better code and update the cost model for
5955       // predicated uniform stores. Today they are treated as any other
5956       // predicated store (see added test cases in
5957       // invariant-store-vectorization.ll).
5958       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5959         NumPredStores++;
5960 
5961       if (Legal->isUniform(Ptr) &&
5962           // Conditional loads and stores should be scalarized and predicated.
5963           // isScalarWithPredication cannot be used here since masked
5964           // gather/scatters are not considered scalar with predication.
5965           !Legal->blockNeedsPredication(I.getParent())) {
5966         // TODO: Avoid replicating loads and stores instead of
5967         // relying on instcombine to remove them.
5968         // Load: Scalar load + broadcast
5969         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5970         unsigned Cost = getUniformMemOpCost(&I, VF);
5971         setWideningDecision(&I, VF, CM_Scalarize, Cost);
5972         continue;
5973       }
5974 
5975       // We assume that widening is the best solution when possible.
5976       if (memoryInstructionCanBeWidened(&I, VF)) {
5977         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5978         int ConsecutiveStride =
5979                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5980         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5981                "Expected consecutive stride.");
5982         InstWidening Decision =
5983             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5984         setWideningDecision(&I, VF, Decision, Cost);
5985         continue;
5986       }
5987 
5988       // Choose between Interleaving, Gather/Scatter or Scalarization.
5989       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5990       unsigned NumAccesses = 1;
5991       if (isAccessInterleaved(&I)) {
5992         auto Group = getInterleavedAccessGroup(&I);
5993         assert(Group && "Fail to get an interleaved access group.");
5994 
5995         // Make one decision for the whole group.
5996         if (getWideningDecision(&I, VF) != CM_Unknown)
5997           continue;
5998 
5999         NumAccesses = Group->getNumMembers();
6000         if (interleavedAccessCanBeWidened(&I, VF))
6001           InterleaveCost = getInterleaveGroupCost(&I, VF);
6002       }
6003 
6004       unsigned GatherScatterCost =
6005           isLegalGatherOrScatter(&I)
6006               ? getGatherScatterCost(&I, VF) * NumAccesses
6007               : std::numeric_limits<unsigned>::max();
6008 
6009       unsigned ScalarizationCost =
6010           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6011 
6012       // Choose better solution for the current VF,
6013       // write down this decision and use it during vectorization.
6014       unsigned Cost;
6015       InstWidening Decision;
6016       if (InterleaveCost <= GatherScatterCost &&
6017           InterleaveCost < ScalarizationCost) {
6018         Decision = CM_Interleave;
6019         Cost = InterleaveCost;
6020       } else if (GatherScatterCost < ScalarizationCost) {
6021         Decision = CM_GatherScatter;
6022         Cost = GatherScatterCost;
6023       } else {
6024         Decision = CM_Scalarize;
6025         Cost = ScalarizationCost;
6026       }
6027       // If the instructions belongs to an interleave group, the whole group
6028       // receives the same decision. The whole group receives the cost, but
6029       // the cost will actually be assigned to one instruction.
6030       if (auto Group = getInterleavedAccessGroup(&I))
6031         setWideningDecision(Group, VF, Decision, Cost);
6032       else
6033         setWideningDecision(&I, VF, Decision, Cost);
6034     }
6035   }
6036 
6037   // Make sure that any load of address and any other address computation
6038   // remains scalar unless there is gather/scatter support. This avoids
6039   // inevitable extracts into address registers, and also has the benefit of
6040   // activating LSR more, since that pass can't optimize vectorized
6041   // addresses.
6042   if (TTI.prefersVectorizedAddressing())
6043     return;
6044 
6045   // Start with all scalar pointer uses.
6046   SmallPtrSet<Instruction *, 8> AddrDefs;
6047   for (BasicBlock *BB : TheLoop->blocks())
6048     for (Instruction &I : *BB) {
6049       Instruction *PtrDef =
6050         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6051       if (PtrDef && TheLoop->contains(PtrDef) &&
6052           getWideningDecision(&I, VF) != CM_GatherScatter)
6053         AddrDefs.insert(PtrDef);
6054     }
6055 
6056   // Add all instructions used to generate the addresses.
6057   SmallVector<Instruction *, 4> Worklist;
6058   for (auto *I : AddrDefs)
6059     Worklist.push_back(I);
6060   while (!Worklist.empty()) {
6061     Instruction *I = Worklist.pop_back_val();
6062     for (auto &Op : I->operands())
6063       if (auto *InstOp = dyn_cast<Instruction>(Op))
6064         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6065             AddrDefs.insert(InstOp).second)
6066           Worklist.push_back(InstOp);
6067   }
6068 
6069   for (auto *I : AddrDefs) {
6070     if (isa<LoadInst>(I)) {
6071       // Setting the desired widening decision should ideally be handled in
6072       // by cost functions, but since this involves the task of finding out
6073       // if the loaded register is involved in an address computation, it is
6074       // instead changed here when we know this is the case.
6075       InstWidening Decision = getWideningDecision(I, VF);
6076       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6077         // Scalarize a widened load of address.
6078         setWideningDecision(I, VF, CM_Scalarize,
6079                             (VF * getMemoryInstructionCost(I, 1)));
6080       else if (auto Group = getInterleavedAccessGroup(I)) {
6081         // Scalarize an interleave group of address loads.
6082         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6083           if (Instruction *Member = Group->getMember(I))
6084             setWideningDecision(Member, VF, CM_Scalarize,
6085                                 (VF * getMemoryInstructionCost(Member, 1)));
6086         }
6087       }
6088     } else
6089       // Make sure I gets scalarized and a cost estimate without
6090       // scalarization overhead.
6091       ForcedScalars[VF].insert(I);
6092   }
6093 }
6094 
6095 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6096                                                         unsigned VF,
6097                                                         Type *&VectorTy) {
6098   Type *RetTy = I->getType();
6099   if (canTruncateToMinimalBitwidth(I, VF))
6100     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6101   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6102   auto SE = PSE.getSE();
6103 
6104   // TODO: We need to estimate the cost of intrinsic calls.
6105   switch (I->getOpcode()) {
6106   case Instruction::GetElementPtr:
6107     // We mark this instruction as zero-cost because the cost of GEPs in
6108     // vectorized code depends on whether the corresponding memory instruction
6109     // is scalarized or not. Therefore, we handle GEPs with the memory
6110     // instruction cost.
6111     return 0;
6112   case Instruction::Br: {
6113     // In cases of scalarized and predicated instructions, there will be VF
6114     // predicated blocks in the vectorized loop. Each branch around these
6115     // blocks requires also an extract of its vector compare i1 element.
6116     bool ScalarPredicatedBB = false;
6117     BranchInst *BI = cast<BranchInst>(I);
6118     if (VF > 1 && BI->isConditional() &&
6119         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6120              PredicatedBBsAfterVectorization.end() ||
6121          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6122              PredicatedBBsAfterVectorization.end()))
6123       ScalarPredicatedBB = true;
6124 
6125     if (ScalarPredicatedBB) {
6126       // Return cost for branches around scalarized and predicated blocks.
6127       Type *Vec_i1Ty =
6128           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6129       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6130               (TTI.getCFInstrCost(Instruction::Br) * VF));
6131     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6132       // The back-edge branch will remain, as will all scalar branches.
6133       return TTI.getCFInstrCost(Instruction::Br);
6134     else
6135       // This branch will be eliminated by if-conversion.
6136       return 0;
6137     // Note: We currently assume zero cost for an unconditional branch inside
6138     // a predicated block since it will become a fall-through, although we
6139     // may decide in the future to call TTI for all branches.
6140   }
6141   case Instruction::PHI: {
6142     auto *Phi = cast<PHINode>(I);
6143 
6144     // First-order recurrences are replaced by vector shuffles inside the loop.
6145     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6146     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6147       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6148                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6149 
6150     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6151     // converted into select instructions. We require N - 1 selects per phi
6152     // node, where N is the number of incoming values.
6153     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6154       return (Phi->getNumIncomingValues() - 1) *
6155              TTI.getCmpSelInstrCost(
6156                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6157                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6158 
6159     return TTI.getCFInstrCost(Instruction::PHI);
6160   }
6161   case Instruction::UDiv:
6162   case Instruction::SDiv:
6163   case Instruction::URem:
6164   case Instruction::SRem:
6165     // If we have a predicated instruction, it may not be executed for each
6166     // vector lane. Get the scalarization cost and scale this amount by the
6167     // probability of executing the predicated block. If the instruction is not
6168     // predicated, we fall through to the next case.
6169     if (VF > 1 && isScalarWithPredication(I)) {
6170       unsigned Cost = 0;
6171 
6172       // These instructions have a non-void type, so account for the phi nodes
6173       // that we will create. This cost is likely to be zero. The phi node
6174       // cost, if any, should be scaled by the block probability because it
6175       // models a copy at the end of each predicated block.
6176       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6177 
6178       // The cost of the non-predicated instruction.
6179       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6180 
6181       // The cost of insertelement and extractelement instructions needed for
6182       // scalarization.
6183       Cost += getScalarizationOverhead(I, VF);
6184 
6185       // Scale the cost by the probability of executing the predicated blocks.
6186       // This assumes the predicated block for each vector lane is equally
6187       // likely.
6188       return Cost / getReciprocalPredBlockProb();
6189     }
6190     LLVM_FALLTHROUGH;
6191   case Instruction::Add:
6192   case Instruction::FAdd:
6193   case Instruction::Sub:
6194   case Instruction::FSub:
6195   case Instruction::Mul:
6196   case Instruction::FMul:
6197   case Instruction::FDiv:
6198   case Instruction::FRem:
6199   case Instruction::Shl:
6200   case Instruction::LShr:
6201   case Instruction::AShr:
6202   case Instruction::And:
6203   case Instruction::Or:
6204   case Instruction::Xor: {
6205     // Since we will replace the stride by 1 the multiplication should go away.
6206     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6207       return 0;
6208     // Certain instructions can be cheaper to vectorize if they have a constant
6209     // second vector operand. One example of this are shifts on x86.
6210     Value *Op2 = I->getOperand(1);
6211     TargetTransformInfo::OperandValueProperties Op2VP;
6212     TargetTransformInfo::OperandValueKind Op2VK =
6213         TTI.getOperandInfo(Op2, Op2VP);
6214     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6215       Op2VK = TargetTransformInfo::OK_UniformValue;
6216 
6217     SmallVector<const Value *, 4> Operands(I->operand_values());
6218     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6219     return N * TTI.getArithmeticInstrCost(
6220                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6221                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
6222   }
6223   case Instruction::FNeg: {
6224     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6225     return N * TTI.getArithmeticInstrCost(
6226                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6227                    TargetTransformInfo::OK_AnyValue,
6228                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6229                    I->getOperand(0));
6230   }
6231   case Instruction::Select: {
6232     SelectInst *SI = cast<SelectInst>(I);
6233     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6234     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6235     Type *CondTy = SI->getCondition()->getType();
6236     if (!ScalarCond)
6237       CondTy = VectorType::get(CondTy, VF);
6238 
6239     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6240   }
6241   case Instruction::ICmp:
6242   case Instruction::FCmp: {
6243     Type *ValTy = I->getOperand(0)->getType();
6244     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6245     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6246       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6247     VectorTy = ToVectorTy(ValTy, VF);
6248     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6249   }
6250   case Instruction::Store:
6251   case Instruction::Load: {
6252     unsigned Width = VF;
6253     if (Width > 1) {
6254       InstWidening Decision = getWideningDecision(I, Width);
6255       assert(Decision != CM_Unknown &&
6256              "CM decision should be taken at this point");
6257       if (Decision == CM_Scalarize)
6258         Width = 1;
6259     }
6260     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6261     return getMemoryInstructionCost(I, VF);
6262   }
6263   case Instruction::ZExt:
6264   case Instruction::SExt:
6265   case Instruction::FPToUI:
6266   case Instruction::FPToSI:
6267   case Instruction::FPExt:
6268   case Instruction::PtrToInt:
6269   case Instruction::IntToPtr:
6270   case Instruction::SIToFP:
6271   case Instruction::UIToFP:
6272   case Instruction::Trunc:
6273   case Instruction::FPTrunc:
6274   case Instruction::BitCast: {
6275     // We optimize the truncation of induction variables having constant
6276     // integer steps. The cost of these truncations is the same as the scalar
6277     // operation.
6278     if (isOptimizableIVTruncate(I, VF)) {
6279       auto *Trunc = cast<TruncInst>(I);
6280       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6281                                   Trunc->getSrcTy(), Trunc);
6282     }
6283 
6284     Type *SrcScalarTy = I->getOperand(0)->getType();
6285     Type *SrcVecTy =
6286         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6287     if (canTruncateToMinimalBitwidth(I, VF)) {
6288       // This cast is going to be shrunk. This may remove the cast or it might
6289       // turn it into slightly different cast. For example, if MinBW == 16,
6290       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6291       //
6292       // Calculate the modified src and dest types.
6293       Type *MinVecTy = VectorTy;
6294       if (I->getOpcode() == Instruction::Trunc) {
6295         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6296         VectorTy =
6297             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6298       } else if (I->getOpcode() == Instruction::ZExt ||
6299                  I->getOpcode() == Instruction::SExt) {
6300         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6301         VectorTy =
6302             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6303       }
6304     }
6305 
6306     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6307     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6308   }
6309   case Instruction::Call: {
6310     bool NeedToScalarize;
6311     CallInst *CI = cast<CallInst>(I);
6312     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6313     if (getVectorIntrinsicIDForCall(CI, TLI))
6314       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6315     return CallCost;
6316   }
6317   default:
6318     // The cost of executing VF copies of the scalar instruction. This opcode
6319     // is unknown. Assume that it is the same as 'mul'.
6320     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6321            getScalarizationOverhead(I, VF);
6322   } // end of switch.
6323 }
6324 
6325 char LoopVectorize::ID = 0;
6326 
6327 static const char lv_name[] = "Loop Vectorization";
6328 
6329 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6330 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6331 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6332 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6333 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6334 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6335 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6336 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6337 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6338 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6339 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6340 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6341 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6342 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6343 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6344 
6345 namespace llvm {
6346 
6347 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6348 
6349 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6350                               bool VectorizeOnlyWhenForced) {
6351   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6352 }
6353 
6354 } // end namespace llvm
6355 
6356 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6357   // Check if the pointer operand of a load or store instruction is
6358   // consecutive.
6359   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6360     return Legal->isConsecutivePtr(Ptr);
6361   return false;
6362 }
6363 
6364 void LoopVectorizationCostModel::collectValuesToIgnore() {
6365   // Ignore ephemeral values.
6366   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6367 
6368   // Ignore type-promoting instructions we identified during reduction
6369   // detection.
6370   for (auto &Reduction : *Legal->getReductionVars()) {
6371     RecurrenceDescriptor &RedDes = Reduction.second;
6372     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6373     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6374   }
6375   // Ignore type-casting instructions we identified during induction
6376   // detection.
6377   for (auto &Induction : *Legal->getInductionVars()) {
6378     InductionDescriptor &IndDes = Induction.second;
6379     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6380     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6381   }
6382 }
6383 
6384 // TODO: we could return a pair of values that specify the max VF and
6385 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6386 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6387 // doesn't have a cost model that can choose which plan to execute if
6388 // more than one is generated.
6389 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6390                                  LoopVectorizationCostModel &CM) {
6391   unsigned WidestType;
6392   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6393   return WidestVectorRegBits / WidestType;
6394 }
6395 
6396 VectorizationFactor
6397 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6398   unsigned VF = UserVF;
6399   // Outer loop handling: They may require CFG and instruction level
6400   // transformations before even evaluating whether vectorization is profitable.
6401   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6402   // the vectorization pipeline.
6403   if (!OrigLoop->empty()) {
6404     // If the user doesn't provide a vectorization factor, determine a
6405     // reasonable one.
6406     if (!UserVF) {
6407       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6408       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6409 
6410       // Make sure we have a VF > 1 for stress testing.
6411       if (VPlanBuildStressTest && VF < 2) {
6412         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6413                           << "overriding computed VF.\n");
6414         VF = 4;
6415       }
6416     }
6417     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6418     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6419     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6420                       << " to build VPlans.\n");
6421     buildVPlans(VF, VF);
6422 
6423     // For VPlan build stress testing, we bail out after VPlan construction.
6424     if (VPlanBuildStressTest)
6425       return VectorizationFactor::Disabled();
6426 
6427     return {VF, 0};
6428   }
6429 
6430   LLVM_DEBUG(
6431       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6432                 "VPlan-native path.\n");
6433   return VectorizationFactor::Disabled();
6434 }
6435 
6436 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6437   assert(OrigLoop->empty() && "Inner loop expected.");
6438   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6439   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
6440     return None;
6441 
6442   // Invalidate interleave groups if all blocks of loop will be predicated.
6443   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6444       !useMaskedInterleavedAccesses(*TTI)) {
6445     LLVM_DEBUG(
6446         dbgs()
6447         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6448            "which requires masked-interleaved support.\n");
6449     CM.InterleaveInfo.reset();
6450   }
6451 
6452   if (UserVF) {
6453     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6454     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6455     // Collect the instructions (and their associated costs) that will be more
6456     // profitable to scalarize.
6457     CM.selectUserVectorizationFactor(UserVF);
6458     buildVPlansWithVPRecipes(UserVF, UserVF);
6459     LLVM_DEBUG(printPlans(dbgs()));
6460     return {{UserVF, 0}};
6461   }
6462 
6463   unsigned MaxVF = MaybeMaxVF.getValue();
6464   assert(MaxVF != 0 && "MaxVF is zero.");
6465 
6466   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6467     // Collect Uniform and Scalar instructions after vectorization with VF.
6468     CM.collectUniformsAndScalars(VF);
6469 
6470     // Collect the instructions (and their associated costs) that will be more
6471     // profitable to scalarize.
6472     if (VF > 1)
6473       CM.collectInstsToScalarize(VF);
6474   }
6475 
6476   buildVPlansWithVPRecipes(1, MaxVF);
6477   LLVM_DEBUG(printPlans(dbgs()));
6478   if (MaxVF == 1)
6479     return VectorizationFactor::Disabled();
6480 
6481   // Select the optimal vectorization factor.
6482   return CM.selectVectorizationFactor(MaxVF);
6483 }
6484 
6485 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6486   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6487                     << '\n');
6488   BestVF = VF;
6489   BestUF = UF;
6490 
6491   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6492     return !Plan->hasVF(VF);
6493   });
6494   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6495 }
6496 
6497 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6498                                            DominatorTree *DT) {
6499   // Perform the actual loop transformation.
6500 
6501   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6502   VPCallbackILV CallbackILV(ILV);
6503 
6504   VPTransformState State{BestVF, BestUF,      LI,
6505                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6506                          &ILV,   CallbackILV};
6507   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6508   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6509 
6510   //===------------------------------------------------===//
6511   //
6512   // Notice: any optimization or new instruction that go
6513   // into the code below should also be implemented in
6514   // the cost-model.
6515   //
6516   //===------------------------------------------------===//
6517 
6518   // 2. Copy and widen instructions from the old loop into the new loop.
6519   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6520   VPlans.front()->execute(&State);
6521 
6522   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6523   //    predication, updating analyses.
6524   ILV.fixVectorizedLoop();
6525 }
6526 
6527 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6528     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6529   BasicBlock *Latch = OrigLoop->getLoopLatch();
6530 
6531   // We create new control-flow for the vectorized loop, so the original
6532   // condition will be dead after vectorization if it's only used by the
6533   // branch.
6534   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6535   if (Cmp && Cmp->hasOneUse())
6536     DeadInstructions.insert(Cmp);
6537 
6538   // We create new "steps" for induction variable updates to which the original
6539   // induction variables map. An original update instruction will be dead if
6540   // all its users except the induction variable are dead.
6541   for (auto &Induction : *Legal->getInductionVars()) {
6542     PHINode *Ind = Induction.first;
6543     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6544     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6545           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6546                                  DeadInstructions.end();
6547         }))
6548       DeadInstructions.insert(IndUpdate);
6549 
6550     // We record as "Dead" also the type-casting instructions we had identified
6551     // during induction analysis. We don't need any handling for them in the
6552     // vectorized loop because we have proven that, under a proper runtime
6553     // test guarding the vectorized loop, the value of the phi, and the casted
6554     // value of the phi, are the same. The last instruction in this casting chain
6555     // will get its scalar/vector/widened def from the scalar/vector/widened def
6556     // of the respective phi node. Any other casts in the induction def-use chain
6557     // have no other uses outside the phi update chain, and will be ignored.
6558     InductionDescriptor &IndDes = Induction.second;
6559     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6560     DeadInstructions.insert(Casts.begin(), Casts.end());
6561   }
6562 }
6563 
6564 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6565 
6566 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6567 
6568 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6569                                         Instruction::BinaryOps BinOp) {
6570   // When unrolling and the VF is 1, we only need to add a simple scalar.
6571   Type *Ty = Val->getType();
6572   assert(!Ty->isVectorTy() && "Val must be a scalar");
6573 
6574   if (Ty->isFloatingPointTy()) {
6575     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6576 
6577     // Floating point operations had to be 'fast' to enable the unrolling.
6578     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6579     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6580   }
6581   Constant *C = ConstantInt::get(Ty, StartIdx);
6582   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6583 }
6584 
6585 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6586   SmallVector<Metadata *, 4> MDs;
6587   // Reserve first location for self reference to the LoopID metadata node.
6588   MDs.push_back(nullptr);
6589   bool IsUnrollMetadata = false;
6590   MDNode *LoopID = L->getLoopID();
6591   if (LoopID) {
6592     // First find existing loop unrolling disable metadata.
6593     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6594       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6595       if (MD) {
6596         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6597         IsUnrollMetadata =
6598             S && S->getString().startswith("llvm.loop.unroll.disable");
6599       }
6600       MDs.push_back(LoopID->getOperand(i));
6601     }
6602   }
6603 
6604   if (!IsUnrollMetadata) {
6605     // Add runtime unroll disable metadata.
6606     LLVMContext &Context = L->getHeader()->getContext();
6607     SmallVector<Metadata *, 1> DisableOperands;
6608     DisableOperands.push_back(
6609         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6610     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6611     MDs.push_back(DisableNode);
6612     MDNode *NewLoopID = MDNode::get(Context, MDs);
6613     // Set operand 0 to refer to the loop id itself.
6614     NewLoopID->replaceOperandWith(0, NewLoopID);
6615     L->setLoopID(NewLoopID);
6616   }
6617 }
6618 
6619 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6620     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6621   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6622   bool PredicateAtRangeStart = Predicate(Range.Start);
6623 
6624   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6625     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6626       Range.End = TmpVF;
6627       break;
6628     }
6629 
6630   return PredicateAtRangeStart;
6631 }
6632 
6633 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6634 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6635 /// of VF's starting at a given VF and extending it as much as possible. Each
6636 /// vectorization decision can potentially shorten this sub-range during
6637 /// buildVPlan().
6638 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6639   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6640     VFRange SubRange = {VF, MaxVF + 1};
6641     VPlans.push_back(buildVPlan(SubRange));
6642     VF = SubRange.End;
6643   }
6644 }
6645 
6646 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6647                                          VPlanPtr &Plan) {
6648   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6649 
6650   // Look for cached value.
6651   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6652   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6653   if (ECEntryIt != EdgeMaskCache.end())
6654     return ECEntryIt->second;
6655 
6656   VPValue *SrcMask = createBlockInMask(Src, Plan);
6657 
6658   // The terminator has to be a branch inst!
6659   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6660   assert(BI && "Unexpected terminator found");
6661 
6662   if (!BI->isConditional())
6663     return EdgeMaskCache[Edge] = SrcMask;
6664 
6665   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6666   assert(EdgeMask && "No Edge Mask found for condition");
6667 
6668   if (BI->getSuccessor(0) != Dst)
6669     EdgeMask = Builder.createNot(EdgeMask);
6670 
6671   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6672     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6673 
6674   return EdgeMaskCache[Edge] = EdgeMask;
6675 }
6676 
6677 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6678   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6679 
6680   // Look for cached value.
6681   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6682   if (BCEntryIt != BlockMaskCache.end())
6683     return BCEntryIt->second;
6684 
6685   // All-one mask is modelled as no-mask following the convention for masked
6686   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6687   VPValue *BlockMask = nullptr;
6688 
6689   if (OrigLoop->getHeader() == BB) {
6690     if (!CM.blockNeedsPredication(BB))
6691       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6692 
6693     // Introduce the early-exit compare IV <= BTC to form header block mask.
6694     // This is used instead of IV < TC because TC may wrap, unlike BTC.
6695     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6696     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6697     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6698     return BlockMaskCache[BB] = BlockMask;
6699   }
6700 
6701   // This is the block mask. We OR all incoming edges.
6702   for (auto *Predecessor : predecessors(BB)) {
6703     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6704     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6705       return BlockMaskCache[BB] = EdgeMask;
6706 
6707     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6708       BlockMask = EdgeMask;
6709       continue;
6710     }
6711 
6712     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6713   }
6714 
6715   return BlockMaskCache[BB] = BlockMask;
6716 }
6717 
6718 VPWidenMemoryInstructionRecipe *
6719 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6720                                   VPlanPtr &Plan) {
6721   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6722     return nullptr;
6723 
6724   auto willWiden = [&](unsigned VF) -> bool {
6725     if (VF == 1)
6726       return false;
6727     LoopVectorizationCostModel::InstWidening Decision =
6728         CM.getWideningDecision(I, VF);
6729     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6730            "CM decision should be taken at this point.");
6731     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6732       return true;
6733     if (CM.isScalarAfterVectorization(I, VF) ||
6734         CM.isProfitableToScalarize(I, VF))
6735       return false;
6736     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6737   };
6738 
6739   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6740     return nullptr;
6741 
6742   VPValue *Mask = nullptr;
6743   if (Legal->isMaskRequired(I))
6744     Mask = createBlockInMask(I->getParent(), Plan);
6745 
6746   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6747 }
6748 
6749 VPWidenIntOrFpInductionRecipe *
6750 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6751   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6752     // Check if this is an integer or fp induction. If so, build the recipe that
6753     // produces its scalar and vector values.
6754     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6755     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6756         II.getKind() == InductionDescriptor::IK_FpInduction)
6757       return new VPWidenIntOrFpInductionRecipe(Phi);
6758 
6759     return nullptr;
6760   }
6761 
6762   // Optimize the special case where the source is a constant integer
6763   // induction variable. Notice that we can only optimize the 'trunc' case
6764   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6765   // (c) other casts depend on pointer size.
6766 
6767   // Determine whether \p K is a truncation based on an induction variable that
6768   // can be optimized.
6769   auto isOptimizableIVTruncate =
6770       [&](Instruction *K) -> std::function<bool(unsigned)> {
6771     return
6772         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6773   };
6774 
6775   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6776                                isOptimizableIVTruncate(I), Range))
6777     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6778                                              cast<TruncInst>(I));
6779   return nullptr;
6780 }
6781 
6782 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6783   PHINode *Phi = dyn_cast<PHINode>(I);
6784   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6785     return nullptr;
6786 
6787   // We know that all PHIs in non-header blocks are converted into selects, so
6788   // we don't have to worry about the insertion order and we can just use the
6789   // builder. At this point we generate the predication tree. There may be
6790   // duplications since this is a simple recursive scan, but future
6791   // optimizations will clean it up.
6792 
6793   SmallVector<VPValue *, 2> Masks;
6794   unsigned NumIncoming = Phi->getNumIncomingValues();
6795   for (unsigned In = 0; In < NumIncoming; In++) {
6796     VPValue *EdgeMask =
6797       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6798     assert((EdgeMask || NumIncoming == 1) &&
6799            "Multiple predecessors with one having a full mask");
6800     if (EdgeMask)
6801       Masks.push_back(EdgeMask);
6802   }
6803   return new VPBlendRecipe(Phi, Masks);
6804 }
6805 
6806 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6807                                  VFRange &Range) {
6808 
6809   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6810       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6811 
6812   if (IsPredicated)
6813     return false;
6814 
6815   auto IsVectorizableOpcode = [](unsigned Opcode) {
6816     switch (Opcode) {
6817     case Instruction::Add:
6818     case Instruction::And:
6819     case Instruction::AShr:
6820     case Instruction::BitCast:
6821     case Instruction::Br:
6822     case Instruction::Call:
6823     case Instruction::FAdd:
6824     case Instruction::FCmp:
6825     case Instruction::FDiv:
6826     case Instruction::FMul:
6827     case Instruction::FNeg:
6828     case Instruction::FPExt:
6829     case Instruction::FPToSI:
6830     case Instruction::FPToUI:
6831     case Instruction::FPTrunc:
6832     case Instruction::FRem:
6833     case Instruction::FSub:
6834     case Instruction::GetElementPtr:
6835     case Instruction::ICmp:
6836     case Instruction::IntToPtr:
6837     case Instruction::Load:
6838     case Instruction::LShr:
6839     case Instruction::Mul:
6840     case Instruction::Or:
6841     case Instruction::PHI:
6842     case Instruction::PtrToInt:
6843     case Instruction::SDiv:
6844     case Instruction::Select:
6845     case Instruction::SExt:
6846     case Instruction::Shl:
6847     case Instruction::SIToFP:
6848     case Instruction::SRem:
6849     case Instruction::Store:
6850     case Instruction::Sub:
6851     case Instruction::Trunc:
6852     case Instruction::UDiv:
6853     case Instruction::UIToFP:
6854     case Instruction::URem:
6855     case Instruction::Xor:
6856     case Instruction::ZExt:
6857       return true;
6858     }
6859     return false;
6860   };
6861 
6862   if (!IsVectorizableOpcode(I->getOpcode()))
6863     return false;
6864 
6865   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6866     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6867     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6868                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6869       return false;
6870   }
6871 
6872   auto willWiden = [&](unsigned VF) -> bool {
6873     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6874                              CM.isProfitableToScalarize(I, VF)))
6875       return false;
6876     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6877       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6878       // The following case may be scalarized depending on the VF.
6879       // The flag shows whether we use Intrinsic or a usual Call for vectorized
6880       // version of the instruction.
6881       // Is it beneficial to perform intrinsic call compared to lib call?
6882       bool NeedToScalarize;
6883       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6884       bool UseVectorIntrinsic =
6885           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6886       return UseVectorIntrinsic || !NeedToScalarize;
6887     }
6888     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6889       assert(CM.getWideningDecision(I, VF) ==
6890                  LoopVectorizationCostModel::CM_Scalarize &&
6891              "Memory widening decisions should have been taken care by now");
6892       return false;
6893     }
6894     return true;
6895   };
6896 
6897   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6898     return false;
6899 
6900   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6901   // to avoid having to split recipes later.
6902   bool IsSingleton = Ingredient2Recipe.count(I);
6903 
6904   // Success: widen this instruction. We optimize the common case where
6905   // consecutive instructions can be represented by a single recipe.
6906   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6907       LastExtensibleRecipe->appendInstruction(I))
6908     return true;
6909 
6910   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6911   if (!IsSingleton)
6912     LastExtensibleRecipe = WidenRecipe;
6913   setRecipe(I, WidenRecipe);
6914   VPBB->appendRecipe(WidenRecipe);
6915   return true;
6916 }
6917 
6918 VPBasicBlock *VPRecipeBuilder::handleReplication(
6919     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6920     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6921     VPlanPtr &Plan) {
6922   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6923       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6924       Range);
6925 
6926   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6927       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6928 
6929   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6930   setRecipe(I, Recipe);
6931 
6932   // Find if I uses a predicated instruction. If so, it will use its scalar
6933   // value. Avoid hoisting the insert-element which packs the scalar value into
6934   // a vector value, as that happens iff all users use the vector value.
6935   for (auto &Op : I->operands())
6936     if (auto *PredInst = dyn_cast<Instruction>(Op))
6937       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6938         PredInst2Recipe[PredInst]->setAlsoPack(false);
6939 
6940   // Finalize the recipe for Instr, first if it is not predicated.
6941   if (!IsPredicated) {
6942     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6943     VPBB->appendRecipe(Recipe);
6944     return VPBB;
6945   }
6946   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6947   assert(VPBB->getSuccessors().empty() &&
6948          "VPBB has successors when handling predicated replication.");
6949   // Record predicated instructions for above packing optimizations.
6950   PredInst2Recipe[I] = Recipe;
6951   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6952   VPBlockUtils::insertBlockAfter(Region, VPBB);
6953   auto *RegSucc = new VPBasicBlock();
6954   VPBlockUtils::insertBlockAfter(RegSucc, Region);
6955   return RegSucc;
6956 }
6957 
6958 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6959                                                       VPRecipeBase *PredRecipe,
6960                                                       VPlanPtr &Plan) {
6961   // Instructions marked for predication are replicated and placed under an
6962   // if-then construct to prevent side-effects.
6963 
6964   // Generate recipes to compute the block mask for this region.
6965   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6966 
6967   // Build the triangular if-then region.
6968   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6969   assert(Instr->getParent() && "Predicated instruction not in any basic block");
6970   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6971   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6972   auto *PHIRecipe =
6973       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6974   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6975   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6976   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6977 
6978   // Note: first set Entry as region entry and then connect successors starting
6979   // from it in order, to propagate the "parent" of each VPBasicBlock.
6980   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6981   VPBlockUtils::connectBlocks(Pred, Exit);
6982 
6983   return Region;
6984 }
6985 
6986 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6987                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
6988   VPRecipeBase *Recipe = nullptr;
6989 
6990   // First, check for specific widening recipes that deal with memory
6991   // operations, inductions and Phi nodes.
6992   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
6993       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
6994       (Recipe = tryToBlend(Instr, Plan)) ||
6995       (isa<PHINode>(Instr) &&
6996        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
6997     setRecipe(Instr, Recipe);
6998     VPBB->appendRecipe(Recipe);
6999     return true;
7000   }
7001 
7002   // Check if Instr is to be widened by a general VPWidenRecipe.
7003   if (tryToWiden(Instr, VPBB, Range))
7004     return true;
7005 
7006   return false;
7007 }
7008 
7009 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7010                                                         unsigned MaxVF) {
7011   assert(OrigLoop->empty() && "Inner loop expected.");
7012 
7013   // Collect conditions feeding internal conditional branches; they need to be
7014   // represented in VPlan for it to model masking.
7015   SmallPtrSet<Value *, 1> NeedDef;
7016 
7017   auto *Latch = OrigLoop->getLoopLatch();
7018   for (BasicBlock *BB : OrigLoop->blocks()) {
7019     if (BB == Latch)
7020       continue;
7021     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7022     if (Branch && Branch->isConditional())
7023       NeedDef.insert(Branch->getCondition());
7024   }
7025 
7026   // If the tail is to be folded by masking, the primary induction variable
7027   // needs to be represented in VPlan for it to model early-exit masking.
7028   // Also, both the Phi and the live-out instruction of each reduction are
7029   // required in order to introduce a select between them in VPlan.
7030   if (CM.foldTailByMasking()) {
7031     NeedDef.insert(Legal->getPrimaryInduction());
7032     for (auto &Reduction : *Legal->getReductionVars()) {
7033       NeedDef.insert(Reduction.first);
7034       NeedDef.insert(Reduction.second.getLoopExitInstr());
7035     }
7036   }
7037 
7038   // Collect instructions from the original loop that will become trivially dead
7039   // in the vectorized loop. We don't need to vectorize these instructions. For
7040   // example, original induction update instructions can become dead because we
7041   // separately emit induction "steps" when generating code for the new loop.
7042   // Similarly, we create a new latch condition when setting up the structure
7043   // of the new loop, so the old one can become dead.
7044   SmallPtrSet<Instruction *, 4> DeadInstructions;
7045   collectTriviallyDeadInstructions(DeadInstructions);
7046 
7047   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7048     VFRange SubRange = {VF, MaxVF + 1};
7049     VPlans.push_back(
7050         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
7051     VF = SubRange.End;
7052   }
7053 }
7054 
7055 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7056     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7057     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7058 
7059   // Hold a mapping from predicated instructions to their recipes, in order to
7060   // fix their AlsoPack behavior if a user is determined to replicate and use a
7061   // scalar instead of vector value.
7062   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7063 
7064   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7065 
7066   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7067 
7068   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7069 
7070   // ---------------------------------------------------------------------------
7071   // Pre-construction: record ingredients whose recipes we'll need to further
7072   // process after constructing the initial VPlan.
7073   // ---------------------------------------------------------------------------
7074 
7075   // Mark instructions we'll need to sink later and their targets as
7076   // ingredients whose recipe we'll need to record.
7077   for (auto &Entry : SinkAfter) {
7078     RecipeBuilder.recordRecipeOf(Entry.first);
7079     RecipeBuilder.recordRecipeOf(Entry.second);
7080   }
7081 
7082   // For each interleave group which is relevant for this (possibly trimmed)
7083   // Range, add it to the set of groups to be later applied to the VPlan and add
7084   // placeholders for its members' Recipes which we'll be replacing with a
7085   // single VPInterleaveRecipe.
7086   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7087     auto applyIG = [IG, this](unsigned VF) -> bool {
7088       return (VF >= 2 && // Query is illegal for VF == 1
7089               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7090                   LoopVectorizationCostModel::CM_Interleave);
7091     };
7092     if (!getDecisionAndClampRange(applyIG, Range))
7093       continue;
7094     InterleaveGroups.insert(IG);
7095     for (unsigned i = 0; i < IG->getFactor(); i++)
7096       if (Instruction *Member = IG->getMember(i))
7097         RecipeBuilder.recordRecipeOf(Member);
7098   };
7099 
7100   // ---------------------------------------------------------------------------
7101   // Build initial VPlan: Scan the body of the loop in a topological order to
7102   // visit each basic block after having visited its predecessor basic blocks.
7103   // ---------------------------------------------------------------------------
7104 
7105   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7106   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7107   auto Plan = std::make_unique<VPlan>(VPBB);
7108 
7109   // Represent values that will have defs inside VPlan.
7110   for (Value *V : NeedDef)
7111     Plan->addVPValue(V);
7112 
7113   // Scan the body of the loop in a topological order to visit each basic block
7114   // after having visited its predecessor basic blocks.
7115   LoopBlocksDFS DFS(OrigLoop);
7116   DFS.perform(LI);
7117 
7118   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7119     // Relevant instructions from basic block BB will be grouped into VPRecipe
7120     // ingredients and fill a new VPBasicBlock.
7121     unsigned VPBBsForBB = 0;
7122     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7123     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7124     VPBB = FirstVPBBForBB;
7125     Builder.setInsertPoint(VPBB);
7126 
7127     // Introduce each ingredient into VPlan.
7128     for (Instruction &I : BB->instructionsWithoutDebug()) {
7129       Instruction *Instr = &I;
7130 
7131       // First filter out irrelevant instructions, to ensure no recipes are
7132       // built for them.
7133       if (isa<BranchInst>(Instr) ||
7134           DeadInstructions.find(Instr) != DeadInstructions.end())
7135         continue;
7136 
7137       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7138         continue;
7139 
7140       // Otherwise, if all widening options failed, Instruction is to be
7141       // replicated. This may create a successor for VPBB.
7142       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7143           Instr, Range, VPBB, PredInst2Recipe, Plan);
7144       if (NextVPBB != VPBB) {
7145         VPBB = NextVPBB;
7146         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7147                                     : "");
7148       }
7149     }
7150   }
7151 
7152   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7153   // may also be empty, such as the last one VPBB, reflecting original
7154   // basic-blocks with no recipes.
7155   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7156   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7157   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7158   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7159   delete PreEntry;
7160 
7161   // ---------------------------------------------------------------------------
7162   // Transform initial VPlan: Apply previously taken decisions, in order, to
7163   // bring the VPlan to its final state.
7164   // ---------------------------------------------------------------------------
7165 
7166   // Apply Sink-After legal constraints.
7167   for (auto &Entry : SinkAfter) {
7168     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7169     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7170     Sink->moveAfter(Target);
7171   }
7172 
7173   // Interleave memory: for each Interleave Group we marked earlier as relevant
7174   // for this VPlan, replace the Recipes widening its memory instructions with a
7175   // single VPInterleaveRecipe at its insertion point.
7176   for (auto IG : InterleaveGroups) {
7177     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7178         RecipeBuilder.getRecipe(IG->getInsertPos()));
7179     (new VPInterleaveRecipe(IG, Recipe->getMask()))->insertBefore(Recipe);
7180 
7181     for (unsigned i = 0; i < IG->getFactor(); ++i)
7182       if (Instruction *Member = IG->getMember(i)) {
7183         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7184       }
7185   }
7186 
7187   // Finally, if tail is folded by masking, introduce selects between the phi
7188   // and the live-out instruction of each reduction, at the end of the latch.
7189   if (CM.foldTailByMasking()) {
7190     Builder.setInsertPoint(VPBB);
7191     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7192     for (auto &Reduction : *Legal->getReductionVars()) {
7193       VPValue *Phi = Plan->getVPValue(Reduction.first);
7194       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7195       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7196     }
7197   }
7198 
7199   std::string PlanName;
7200   raw_string_ostream RSO(PlanName);
7201   unsigned VF = Range.Start;
7202   Plan->addVF(VF);
7203   RSO << "Initial VPlan for VF={" << VF;
7204   for (VF *= 2; VF < Range.End; VF *= 2) {
7205     Plan->addVF(VF);
7206     RSO << "," << VF;
7207   }
7208   RSO << "},UF>=1";
7209   RSO.flush();
7210   Plan->setName(PlanName);
7211 
7212   return Plan;
7213 }
7214 
7215 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7216   // Outer loop handling: They may require CFG and instruction level
7217   // transformations before even evaluating whether vectorization is profitable.
7218   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7219   // the vectorization pipeline.
7220   assert(!OrigLoop->empty());
7221   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7222 
7223   // Create new empty VPlan
7224   auto Plan = std::make_unique<VPlan>();
7225 
7226   // Build hierarchical CFG
7227   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7228   HCFGBuilder.buildHierarchicalCFG();
7229 
7230   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7231     Plan->addVF(VF);
7232 
7233   if (EnableVPlanPredication) {
7234     VPlanPredicator VPP(*Plan);
7235     VPP.predicate();
7236 
7237     // Avoid running transformation to recipes until masked code generation in
7238     // VPlan-native path is in place.
7239     return Plan;
7240   }
7241 
7242   SmallPtrSet<Instruction *, 1> DeadInstructions;
7243   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7244       Plan, Legal->getInductionVars(), DeadInstructions);
7245 
7246   return Plan;
7247 }
7248 
7249 Value* LoopVectorizationPlanner::VPCallbackILV::
7250 getOrCreateVectorValues(Value *V, unsigned Part) {
7251       return ILV.getOrCreateVectorValue(V, Part);
7252 }
7253 
7254 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7255   O << " +\n"
7256     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7257   IG->getInsertPos()->printAsOperand(O, false);
7258   if (User) {
7259     O << ", ";
7260     User->getOperand(0)->printAsOperand(O);
7261   }
7262   O << "\\l\"";
7263   for (unsigned i = 0; i < IG->getFactor(); ++i)
7264     if (Instruction *I = IG->getMember(i))
7265       O << " +\n"
7266         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7267 }
7268 
7269 void VPWidenRecipe::execute(VPTransformState &State) {
7270   for (auto &Instr : make_range(Begin, End))
7271     State.ILV->widenInstruction(Instr);
7272 }
7273 
7274 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7275   assert(!State.Instance && "Int or FP induction being replicated.");
7276   State.ILV->widenIntOrFpInduction(IV, Trunc);
7277 }
7278 
7279 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7280   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7281 }
7282 
7283 void VPBlendRecipe::execute(VPTransformState &State) {
7284   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7285   // We know that all PHIs in non-header blocks are converted into
7286   // selects, so we don't have to worry about the insertion order and we
7287   // can just use the builder.
7288   // At this point we generate the predication tree. There may be
7289   // duplications since this is a simple recursive scan, but future
7290   // optimizations will clean it up.
7291 
7292   unsigned NumIncoming = Phi->getNumIncomingValues();
7293 
7294   assert((User || NumIncoming == 1) &&
7295          "Multiple predecessors with predecessors having a full mask");
7296   // Generate a sequence of selects of the form:
7297   // SELECT(Mask3, In3,
7298   //      SELECT(Mask2, In2,
7299   //                   ( ...)))
7300   InnerLoopVectorizer::VectorParts Entry(State.UF);
7301   for (unsigned In = 0; In < NumIncoming; ++In) {
7302     for (unsigned Part = 0; Part < State.UF; ++Part) {
7303       // We might have single edge PHIs (blocks) - use an identity
7304       // 'select' for the first PHI operand.
7305       Value *In0 =
7306           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7307       if (In == 0)
7308         Entry[Part] = In0; // Initialize with the first incoming value.
7309       else {
7310         // Select between the current value and the previous incoming edge
7311         // based on the incoming mask.
7312         Value *Cond = State.get(User->getOperand(In), Part);
7313         Entry[Part] =
7314             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7315       }
7316     }
7317   }
7318   for (unsigned Part = 0; Part < State.UF; ++Part)
7319     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7320 }
7321 
7322 void VPInterleaveRecipe::execute(VPTransformState &State) {
7323   assert(!State.Instance && "Interleave group being replicated.");
7324   if (!User)
7325     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7326 
7327   // Last (and currently only) operand is a mask.
7328   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7329   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7330   for (unsigned Part = 0; Part < State.UF; ++Part)
7331     MaskValues[Part] = State.get(Mask, Part);
7332   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7333 }
7334 
7335 void VPReplicateRecipe::execute(VPTransformState &State) {
7336   if (State.Instance) { // Generate a single instance.
7337     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7338     // Insert scalar instance packing it into a vector.
7339     if (AlsoPack && State.VF > 1) {
7340       // If we're constructing lane 0, initialize to start from undef.
7341       if (State.Instance->Lane == 0) {
7342         Value *Undef =
7343             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7344         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7345       }
7346       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7347     }
7348     return;
7349   }
7350 
7351   // Generate scalar instances for all VF lanes of all UF parts, unless the
7352   // instruction is uniform inwhich case generate only the first lane for each
7353   // of the UF parts.
7354   unsigned EndLane = IsUniform ? 1 : State.VF;
7355   for (unsigned Part = 0; Part < State.UF; ++Part)
7356     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7357       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7358 }
7359 
7360 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7361   assert(State.Instance && "Branch on Mask works only on single instance.");
7362 
7363   unsigned Part = State.Instance->Part;
7364   unsigned Lane = State.Instance->Lane;
7365 
7366   Value *ConditionBit = nullptr;
7367   if (!User) // Block in mask is all-one.
7368     ConditionBit = State.Builder.getTrue();
7369   else {
7370     VPValue *BlockInMask = User->getOperand(0);
7371     ConditionBit = State.get(BlockInMask, Part);
7372     if (ConditionBit->getType()->isVectorTy())
7373       ConditionBit = State.Builder.CreateExtractElement(
7374           ConditionBit, State.Builder.getInt32(Lane));
7375   }
7376 
7377   // Replace the temporary unreachable terminator with a new conditional branch,
7378   // whose two destinations will be set later when they are created.
7379   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7380   assert(isa<UnreachableInst>(CurrentTerminator) &&
7381          "Expected to replace unreachable terminator with conditional branch.");
7382   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7383   CondBr->setSuccessor(0, nullptr);
7384   ReplaceInstWithInst(CurrentTerminator, CondBr);
7385 }
7386 
7387 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7388   assert(State.Instance && "Predicated instruction PHI works per instance.");
7389   Instruction *ScalarPredInst = cast<Instruction>(
7390       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7391   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7392   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7393   assert(PredicatingBB && "Predicated block has no single predecessor.");
7394 
7395   // By current pack/unpack logic we need to generate only a single phi node: if
7396   // a vector value for the predicated instruction exists at this point it means
7397   // the instruction has vector users only, and a phi for the vector value is
7398   // needed. In this case the recipe of the predicated instruction is marked to
7399   // also do that packing, thereby "hoisting" the insert-element sequence.
7400   // Otherwise, a phi node for the scalar value is needed.
7401   unsigned Part = State.Instance->Part;
7402   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7403     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7404     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7405     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7406     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7407     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7408     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7409   } else {
7410     Type *PredInstType = PredInst->getType();
7411     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7412     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7413     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7414     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7415   }
7416 }
7417 
7418 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7419   VPValue *Mask = getMask();
7420   if (!Mask)
7421     return State.ILV->vectorizeMemoryInstruction(&Instr);
7422 
7423   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7424   for (unsigned Part = 0; Part < State.UF; ++Part)
7425     MaskValues[Part] = State.get(Mask, Part);
7426   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7427 }
7428 
7429 static ScalarEpilogueLowering
7430 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7431                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
7432                           TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7433                           AssumptionCache *AC, LoopInfo *LI,
7434                           ScalarEvolution *SE, DominatorTree *DT,
7435                           const LoopAccessInfo *LAI) {
7436   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7437   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7438                               !PreferPredicateOverEpilog;
7439 
7440   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7441       (F->hasOptSize() ||
7442        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7443                                    PGSOQueryType::IRPass)))
7444     SEL = CM_ScalarEpilogueNotAllowedOptSize;
7445   else if (PreferPredicateOverEpilog ||
7446            Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7447            (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) &&
7448             Hints.getPredicate() != LoopVectorizeHints::FK_Disabled &&
7449             !PredicateOptDisabled))
7450     SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7451 
7452   return SEL;
7453 }
7454 
7455 // Process the loop in the VPlan-native vectorization path. This path builds
7456 // VPlan upfront in the vectorization pipeline, which allows to apply
7457 // VPlan-to-VPlan transformations from the very beginning without modifying the
7458 // input LLVM IR.
7459 static bool processLoopInVPlanNativePath(
7460     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7461     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7462     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7463     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7464     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7465 
7466   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7467   Function *F = L->getHeader()->getParent();
7468   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7469 
7470   ScalarEpilogueLowering SEL =
7471     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
7472                               PSE.getSE(), DT, LVL->getLAI());
7473 
7474   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7475                                 &Hints, IAI);
7476   // Use the planner for outer loop vectorization.
7477   // TODO: CM is not used at this point inside the planner. Turn CM into an
7478   // optional argument if we don't need it in the future.
7479   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7480 
7481   // Get user vectorization factor.
7482   const unsigned UserVF = Hints.getWidth();
7483 
7484   // Plan how to best vectorize, return the best VF and its cost.
7485   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7486 
7487   // If we are stress testing VPlan builds, do not attempt to generate vector
7488   // code. Masked vector code generation support will follow soon.
7489   // Also, do not attempt to vectorize if no vector code will be produced.
7490   if (VPlanBuildStressTest || EnableVPlanPredication ||
7491       VectorizationFactor::Disabled() == VF)
7492     return false;
7493 
7494   LVP.setBestPlan(VF.Width, 1);
7495 
7496   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7497                          &CM);
7498   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7499                     << L->getHeader()->getParent()->getName() << "\"\n");
7500   LVP.executePlan(LB, DT);
7501 
7502   // Mark the loop as already vectorized to avoid vectorizing again.
7503   Hints.setAlreadyVectorized();
7504 
7505   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7506   return true;
7507 }
7508 
7509 bool LoopVectorizePass::processLoop(Loop *L) {
7510   assert((EnableVPlanNativePath || L->empty()) &&
7511          "VPlan-native path is not enabled. Only process inner loops.");
7512 
7513 #ifndef NDEBUG
7514   const std::string DebugLocStr = getDebugLocString(L);
7515 #endif /* NDEBUG */
7516 
7517   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7518                     << L->getHeader()->getParent()->getName() << "\" from "
7519                     << DebugLocStr << "\n");
7520 
7521   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7522 
7523   LLVM_DEBUG(
7524       dbgs() << "LV: Loop hints:"
7525              << " force="
7526              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7527                      ? "disabled"
7528                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7529                             ? "enabled"
7530                             : "?"))
7531              << " width=" << Hints.getWidth()
7532              << " unroll=" << Hints.getInterleave() << "\n");
7533 
7534   // Function containing loop
7535   Function *F = L->getHeader()->getParent();
7536 
7537   // Looking at the diagnostic output is the only way to determine if a loop
7538   // was vectorized (other than looking at the IR or machine code), so it
7539   // is important to generate an optimization remark for each loop. Most of
7540   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7541   // generated as OptimizationRemark and OptimizationRemarkMissed are
7542   // less verbose reporting vectorized loops and unvectorized loops that may
7543   // benefit from vectorization, respectively.
7544 
7545   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7546     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7547     return false;
7548   }
7549 
7550   PredicatedScalarEvolution PSE(*SE, *L);
7551 
7552   // Check if it is legal to vectorize the loop.
7553   LoopVectorizationRequirements Requirements(*ORE);
7554   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7555                                 &Requirements, &Hints, DB, AC);
7556   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7557     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7558     Hints.emitRemarkWithHints();
7559     return false;
7560   }
7561 
7562   // Check the function attributes and profiles to find out if this function
7563   // should be optimized for size.
7564   ScalarEpilogueLowering SEL =
7565     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
7566                               PSE.getSE(), DT, LVL.getLAI());
7567 
7568   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7569   // here. They may require CFG and instruction level transformations before
7570   // even evaluating whether vectorization is profitable. Since we cannot modify
7571   // the incoming IR, we need to build VPlan upfront in the vectorization
7572   // pipeline.
7573   if (!L->empty())
7574     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7575                                         ORE, BFI, PSI, Hints);
7576 
7577   assert(L->empty() && "Inner loop expected.");
7578 
7579   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7580   // count by optimizing for size, to minimize overheads.
7581   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7582   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7583     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7584                       << "This loop is worth vectorizing only if no scalar "
7585                       << "iteration overheads are incurred.");
7586     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7587       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7588     else {
7589       LLVM_DEBUG(dbgs() << "\n");
7590       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7591     }
7592   }
7593 
7594   // Check the function attributes to see if implicit floats are allowed.
7595   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7596   // an integer loop and the vector instructions selected are purely integer
7597   // vector instructions?
7598   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7599     reportVectorizationFailure(
7600         "Can't vectorize when the NoImplicitFloat attribute is used",
7601         "loop not vectorized due to NoImplicitFloat attribute",
7602         "NoImplicitFloat", ORE, L);
7603     Hints.emitRemarkWithHints();
7604     return false;
7605   }
7606 
7607   // Check if the target supports potentially unsafe FP vectorization.
7608   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7609   // for the target we're vectorizing for, to make sure none of the
7610   // additional fp-math flags can help.
7611   if (Hints.isPotentiallyUnsafe() &&
7612       TTI->isFPVectorizationPotentiallyUnsafe()) {
7613     reportVectorizationFailure(
7614         "Potentially unsafe FP op prevents vectorization",
7615         "loop not vectorized due to unsafe FP support.",
7616         "UnsafeFP", ORE, L);
7617     Hints.emitRemarkWithHints();
7618     return false;
7619   }
7620 
7621   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7622   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7623 
7624   // If an override option has been passed in for interleaved accesses, use it.
7625   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7626     UseInterleaved = EnableInterleavedMemAccesses;
7627 
7628   // Analyze interleaved memory accesses.
7629   if (UseInterleaved) {
7630     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7631   }
7632 
7633   // Use the cost model.
7634   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7635                                 F, &Hints, IAI);
7636   CM.collectValuesToIgnore();
7637 
7638   // Use the planner for vectorization.
7639   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7640 
7641   // Get user vectorization factor.
7642   unsigned UserVF = Hints.getWidth();
7643 
7644   // Plan how to best vectorize, return the best VF and its cost.
7645   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7646 
7647   VectorizationFactor VF = VectorizationFactor::Disabled();
7648   unsigned IC = 1;
7649   unsigned UserIC = Hints.getInterleave();
7650 
7651   if (MaybeVF) {
7652     VF = *MaybeVF;
7653     // Select the interleave count.
7654     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7655   }
7656 
7657   // Identify the diagnostic messages that should be produced.
7658   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7659   bool VectorizeLoop = true, InterleaveLoop = true;
7660   if (Requirements.doesNotMeet(F, L, Hints)) {
7661     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7662                          "requirements.\n");
7663     Hints.emitRemarkWithHints();
7664     return false;
7665   }
7666 
7667   if (VF.Width == 1) {
7668     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7669     VecDiagMsg = std::make_pair(
7670         "VectorizationNotBeneficial",
7671         "the cost-model indicates that vectorization is not beneficial");
7672     VectorizeLoop = false;
7673   }
7674 
7675   if (!MaybeVF && UserIC > 1) {
7676     // Tell the user interleaving was avoided up-front, despite being explicitly
7677     // requested.
7678     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7679                          "interleaving should be avoided up front\n");
7680     IntDiagMsg = std::make_pair(
7681         "InterleavingAvoided",
7682         "Ignoring UserIC, because interleaving was avoided up front");
7683     InterleaveLoop = false;
7684   } else if (IC == 1 && UserIC <= 1) {
7685     // Tell the user interleaving is not beneficial.
7686     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7687     IntDiagMsg = std::make_pair(
7688         "InterleavingNotBeneficial",
7689         "the cost-model indicates that interleaving is not beneficial");
7690     InterleaveLoop = false;
7691     if (UserIC == 1) {
7692       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7693       IntDiagMsg.second +=
7694           " and is explicitly disabled or interleave count is set to 1";
7695     }
7696   } else if (IC > 1 && UserIC == 1) {
7697     // Tell the user interleaving is beneficial, but it explicitly disabled.
7698     LLVM_DEBUG(
7699         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7700     IntDiagMsg = std::make_pair(
7701         "InterleavingBeneficialButDisabled",
7702         "the cost-model indicates that interleaving is beneficial "
7703         "but is explicitly disabled or interleave count is set to 1");
7704     InterleaveLoop = false;
7705   }
7706 
7707   // Override IC if user provided an interleave count.
7708   IC = UserIC > 0 ? UserIC : IC;
7709 
7710   // Emit diagnostic messages, if any.
7711   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7712   if (!VectorizeLoop && !InterleaveLoop) {
7713     // Do not vectorize or interleaving the loop.
7714     ORE->emit([&]() {
7715       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7716                                       L->getStartLoc(), L->getHeader())
7717              << VecDiagMsg.second;
7718     });
7719     ORE->emit([&]() {
7720       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7721                                       L->getStartLoc(), L->getHeader())
7722              << IntDiagMsg.second;
7723     });
7724     return false;
7725   } else if (!VectorizeLoop && InterleaveLoop) {
7726     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7727     ORE->emit([&]() {
7728       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7729                                         L->getStartLoc(), L->getHeader())
7730              << VecDiagMsg.second;
7731     });
7732   } else if (VectorizeLoop && !InterleaveLoop) {
7733     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7734                       << ") in " << DebugLocStr << '\n');
7735     ORE->emit([&]() {
7736       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7737                                         L->getStartLoc(), L->getHeader())
7738              << IntDiagMsg.second;
7739     });
7740   } else if (VectorizeLoop && InterleaveLoop) {
7741     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7742                       << ") in " << DebugLocStr << '\n');
7743     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7744   }
7745 
7746   LVP.setBestPlan(VF.Width, IC);
7747 
7748   using namespace ore;
7749   bool DisableRuntimeUnroll = false;
7750   MDNode *OrigLoopID = L->getLoopID();
7751 
7752   if (!VectorizeLoop) {
7753     assert(IC > 1 && "interleave count should not be 1 or 0");
7754     // If we decided that it is not legal to vectorize the loop, then
7755     // interleave it.
7756     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7757                                &CM);
7758     LVP.executePlan(Unroller, DT);
7759 
7760     ORE->emit([&]() {
7761       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7762                                 L->getHeader())
7763              << "interleaved loop (interleaved count: "
7764              << NV("InterleaveCount", IC) << ")";
7765     });
7766   } else {
7767     // If we decided that it is *legal* to vectorize the loop, then do it.
7768     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7769                            &LVL, &CM);
7770     LVP.executePlan(LB, DT);
7771     ++LoopsVectorized;
7772 
7773     // Add metadata to disable runtime unrolling a scalar loop when there are
7774     // no runtime checks about strides and memory. A scalar loop that is
7775     // rarely used is not worth unrolling.
7776     if (!LB.areSafetyChecksAdded())
7777       DisableRuntimeUnroll = true;
7778 
7779     // Report the vectorization decision.
7780     ORE->emit([&]() {
7781       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7782                                 L->getHeader())
7783              << "vectorized loop (vectorization width: "
7784              << NV("VectorizationFactor", VF.Width)
7785              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7786     });
7787   }
7788 
7789   Optional<MDNode *> RemainderLoopID =
7790       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7791                                       LLVMLoopVectorizeFollowupEpilogue});
7792   if (RemainderLoopID.hasValue()) {
7793     L->setLoopID(RemainderLoopID.getValue());
7794   } else {
7795     if (DisableRuntimeUnroll)
7796       AddRuntimeUnrollDisableMetaData(L);
7797 
7798     // Mark the loop as already vectorized to avoid vectorizing again.
7799     Hints.setAlreadyVectorized();
7800   }
7801 
7802   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7803   return true;
7804 }
7805 
7806 bool LoopVectorizePass::runImpl(
7807     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7808     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7809     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7810     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7811     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7812   SE = &SE_;
7813   LI = &LI_;
7814   TTI = &TTI_;
7815   DT = &DT_;
7816   BFI = &BFI_;
7817   TLI = TLI_;
7818   AA = &AA_;
7819   AC = &AC_;
7820   GetLAA = &GetLAA_;
7821   DB = &DB_;
7822   ORE = &ORE_;
7823   PSI = PSI_;
7824 
7825   // Don't attempt if
7826   // 1. the target claims to have no vector registers, and
7827   // 2. interleaving won't help ILP.
7828   //
7829   // The second condition is necessary because, even if the target has no
7830   // vector registers, loop vectorization may still enable scalar
7831   // interleaving.
7832   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7833       TTI->getMaxInterleaveFactor(1) < 2)
7834     return false;
7835 
7836   bool Changed = false;
7837 
7838   // The vectorizer requires loops to be in simplified form.
7839   // Since simplification may add new inner loops, it has to run before the
7840   // legality and profitability checks. This means running the loop vectorizer
7841   // will simplify all loops, regardless of whether anything end up being
7842   // vectorized.
7843   for (auto &L : *LI)
7844     Changed |=
7845         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7846 
7847   // Build up a worklist of inner-loops to vectorize. This is necessary as
7848   // the act of vectorizing or partially unrolling a loop creates new loops
7849   // and can invalidate iterators across the loops.
7850   SmallVector<Loop *, 8> Worklist;
7851 
7852   for (Loop *L : *LI)
7853     collectSupportedLoops(*L, LI, ORE, Worklist);
7854 
7855   LoopsAnalyzed += Worklist.size();
7856 
7857   // Now walk the identified inner loops.
7858   while (!Worklist.empty()) {
7859     Loop *L = Worklist.pop_back_val();
7860 
7861     // For the inner loops we actually process, form LCSSA to simplify the
7862     // transform.
7863     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7864 
7865     Changed |= processLoop(L);
7866   }
7867 
7868   // Process each loop nest in the function.
7869   return Changed;
7870 }
7871 
7872 PreservedAnalyses LoopVectorizePass::run(Function &F,
7873                                          FunctionAnalysisManager &AM) {
7874     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7875     auto &LI = AM.getResult<LoopAnalysis>(F);
7876     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7877     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7878     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7879     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7880     auto &AA = AM.getResult<AAManager>(F);
7881     auto &AC = AM.getResult<AssumptionAnalysis>(F);
7882     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7883     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7884     MemorySSA *MSSA = EnableMSSALoopDependency
7885                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7886                           : nullptr;
7887 
7888     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7889     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7890         [&](Loop &L) -> const LoopAccessInfo & {
7891       LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7892       return LAM.getResult<LoopAccessAnalysis>(L, AR);
7893     };
7894     const ModuleAnalysisManager &MAM =
7895         AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7896     ProfileSummaryInfo *PSI =
7897         MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7898     bool Changed =
7899         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7900     if (!Changed)
7901       return PreservedAnalyses::all();
7902     PreservedAnalyses PA;
7903 
7904     // We currently do not preserve loopinfo/dominator analyses with outer loop
7905     // vectorization. Until this is addressed, mark these analyses as preserved
7906     // only for non-VPlan-native path.
7907     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7908     if (!EnableVPlanNativePath) {
7909       PA.preserve<LoopAnalysis>();
7910       PA.preserve<DominatorTreeAnalysis>();
7911     }
7912     PA.preserve<BasicAA>();
7913     PA.preserve<GlobalsAA>();
7914     return PA;
7915 }
7916