1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
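//
// As a rough illustration (the loop below is made up for this comment, not
// taken from any particular test), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each new iteration processes VF elements
// at once: with VF = 4 the induction variable advances by 4 and the body
// operates on 4-element vectors, while any left-over iterations are handled
// by a scalar epilogue loop (or folded into the loop via predication).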
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/SizeOpts.h"
141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142 #include <algorithm>
143 #include <cassert>
144 #include <cstdint>
145 #include <cstdlib>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
180 // Indicates that an epilogue is undesired; predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185     cl::desc("Indicate that an epilogue is undesired, predication should be "
186              "used instead."));
187 
188 static cl::opt<bool> MaximizeBandwidth(
189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190     cl::desc("Maximize bandwidth when selecting vectorization factor which "
191              "will be determined by the smallest type in the loop."));
192 
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
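
// The two interleaved-access options above target patterns like the following
// (illustrative only; the names are invented for this comment):
//
//   for (int i = 0; i < n; i += 2) {
//     sum += a[i];     // even element
//     sum += a[i + 1]; // odd element
//   }
//
// The two loads form an interleave group with factor 2 that can be lowered to
// a single wide load per vector iteration followed by shufflevector
// instructions. If the group has gaps, or lives in a block that requires
// predication, the masked variant of the transform (the second option) is
// needed.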
202 
203 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
204     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
205     cl::desc("We don't interleave loops with an estimated constant trip count "
206              "below this number"));
207 
208 static cl::opt<unsigned> ForceTargetNumScalarRegs(
209     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
210     cl::desc("A flag that overrides the target's number of scalar registers."));
211 
212 static cl::opt<unsigned> ForceTargetNumVectorRegs(
213     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
214     cl::desc("A flag that overrides the target's number of vector registers."));
215 
216 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
217     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
218     cl::desc("A flag that overrides the target's max interleave factor for "
219              "scalar loops."));
220 
221 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
222     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
223     cl::desc("A flag that overrides the target's max interleave factor for "
224              "vectorized loops."));
225 
226 static cl::opt<unsigned> ForceTargetInstructionCost(
227     "force-target-instruction-cost", cl::init(0), cl::Hidden,
228     cl::desc("A flag that overrides the target's expected cost for "
229              "an instruction to a single constant value. Mostly "
230              "useful for getting consistent testing."));
231 
232 static cl::opt<unsigned> SmallLoopCost(
233     "small-loop-cost", cl::init(20), cl::Hidden,
234     cl::desc(
235         "The cost of a loop that is considered 'small' by the interleaver."));
236 
237 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
238     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
239     cl::desc("Enable the use of the block frequency analysis to access PGO "
240              "heuristics minimizing code growth in cold regions and being more "
241              "aggressive in hot regions."));
242 
243 // Runtime interleave loops for load/store throughput.
244 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
245     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
246     cl::desc(
247         "Enable runtime interleaving until load/store ports are saturated"));
248 
249 /// The number of stores in a loop that are allowed to need predication.
250 static cl::opt<unsigned> NumberOfStoresToPredicate(
251     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
252     cl::desc("Max number of stores to be predicated behind an if."));
253 
254 static cl::opt<bool> EnableIndVarRegisterHeur(
255     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
256     cl::desc("Count the induction variable only once when interleaving"));
257 
258 static cl::opt<bool> EnableCondStoresVectorization(
259     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
260     cl::desc("Enable if predication of stores during vectorization."));
261 
262 static cl::opt<unsigned> MaxNestedScalarReductionIC(
263     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
264     cl::desc("The maximum interleave count to use when interleaving a scalar "
265              "reduction in a nested loop."));
266 
267 cl::opt<bool> EnableVPlanNativePath(
268     "enable-vplan-native-path", cl::init(false), cl::Hidden,
269     cl::desc("Enable VPlan-native vectorization path with "
270              "support for outer loop vectorization."));
271 
272 // FIXME: Remove this switch once we have divergence analysis. Currently we
273 // assume divergent non-backedge branches when this switch is true.
274 cl::opt<bool> EnableVPlanPredication(
275     "enable-vplan-predication", cl::init(false), cl::Hidden,
276     cl::desc("Enable VPlan-native vectorization path predicator with "
277              "support for outer loop vectorization."));
278 
279 // This flag enables the stress testing of the VPlan H-CFG construction in the
280 // VPlan-native vectorization path. It must be used in conjunction with
281 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
282 // verification of the H-CFGs built.
283 static cl::opt<bool> VPlanBuildStressTest(
284     "vplan-build-stress-test", cl::init(false), cl::Hidden,
285     cl::desc(
286         "Build VPlan for every supported loop nest in the function and bail "
287         "out right after the build (stress test the VPlan H-CFG construction "
288         "in the VPlan-native vectorization path)."));
289 
290 cl::opt<bool> llvm::EnableLoopInterleaving(
291     "interleave-loops", cl::init(true), cl::Hidden,
292     cl::desc("Enable loop interleaving in Loop vectorization passes"));
293 cl::opt<bool> llvm::EnableLoopVectorization(
294     "vectorize-loops", cl::init(true), cl::Hidden,
295     cl::desc("Run the Loop vectorization passes"));
296 
297 /// A helper function for converting Scalar types to vector types.
298 /// If the incoming type is void, we return void. If the VF is 1, we return
299 /// the scalar type.
300 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
301   if (Scalar->isVoidTy() || VF == 1)
302     return Scalar;
303   return VectorType::get(Scalar, VF);
304 }
305 
306 /// A helper function that returns the type of the loaded or stored value.
307 static Type *getMemInstValueType(Value *I) {
308   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
309          "Expected Load or Store instruction");
310   if (auto *LI = dyn_cast<LoadInst>(I))
311     return LI->getType();
312   return cast<StoreInst>(I)->getValueOperand()->getType();
313 }
314 
315 /// A helper function that returns true if the given type is irregular. The
316 /// type is irregular if its allocated size doesn't equal the store size of an
317 /// element of the corresponding vector type at the given vectorization factor.
318 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
319   // Determine if an array of VF elements of type Ty is "bitcast compatible"
320   // with a <VF x Ty> vector.
321   if (VF > 1) {
322     auto *VectorTy = VectorType::get(Ty, VF);
323     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
324   }
325 
326   // If the vectorization factor is one, we just check if an array of type Ty
327   // requires padding between elements.
328   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
329 }
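
// For example (sizes assume a typical x86-64 data layout, so treat the numbers
// as illustrative): i32 is regular, since a [4 x i32] array and a <4 x i32>
// vector share the same 16-byte layout, whereas x86_fp80 is irregular, because
// each element is padded out to 16 bytes in memory (its alloc size) while only
// 10 bytes of it hold data (its store size).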
330 
331 /// A helper function that returns the reciprocal of the block probability of
332 /// predicated blocks. If we return X, we are assuming the predicated block
333 /// will execute once for every X iterations of the loop header.
334 ///
335 /// TODO: We should use actual block probability here, if available. Currently,
336 ///       we always assume predicated blocks have a 50% chance of executing.
337 static unsigned getReciprocalPredBlockProb() { return 2; }
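
// For example, under this assumption an instruction in a predicated block
// whose scalar cost is C contributes roughly
// C / getReciprocalPredBlockProb() = C / 2 to the expected per-iteration cost.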
338 
339 /// A helper function that adds a 'fast' flag to floating-point operations.
340 static Value *addFastMathFlag(Value *V) {
341   if (isa<FPMathOperator>(V))
342     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
343   return V;
344 }
345 
346 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
347   if (isa<FPMathOperator>(V))
348     cast<Instruction>(V)->setFastMathFlags(FMF);
349   return V;
350 }
351 
352 /// A helper function that returns an integer or floating-point constant with
353 /// value C.
354 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
355   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
356                            : ConstantFP::get(Ty, C);
357 }
358 
359 /// Returns "best known" trip count for the specified loop \p L as defined by
360 /// the following procedure:
361 ///   1) Returns exact trip count if it is known.
362 ///   2) Returns expected trip count according to profile data if any.
363 ///   3) Returns upper bound estimate if it is known.
364 ///   4) Returns None if all of the above failed.
365 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
366   // Check if exact trip count is known.
367   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
368     return ExpectedTC;
369 
370   // Check if there is an expected trip count available from profile data.
371   if (LoopVectorizeWithBlockFrequency)
372     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
373       return EstimatedTC;
374 
375   // Check if upper bound estimate is known.
376   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
377     return ExpectedTC;
378 
379   return None;
380 }
381 
382 namespace llvm {
383 
384 /// InnerLoopVectorizer vectorizes loops which contain only one basic
385 /// block to a specified vectorization factor (VF).
386 /// This class performs the widening of scalars into vectors, or multiple
387 /// scalars. This class also implements the following features:
388 /// * It inserts an epilogue loop for handling loops that don't have iteration
389 ///   counts that are known to be a multiple of the vectorization factor.
390 /// * It handles the code generation for reduction variables.
391 /// * It performs scalarization (implementation using scalars) of
392 ///   un-vectorizable instructions.
393 /// InnerLoopVectorizer does not perform any vectorization-legality
394 /// checks, and relies on the caller to check for the different legality
395 /// aspects. The InnerLoopVectorizer relies on the
396 /// LoopVectorizationLegality class to provide information about the induction
397 /// and reduction variables that were found for a given vectorization factor.
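///
/// As a rough sketch (the block names follow the members declared below; the
/// exact shape depends on which runtime checks are emitted), the generated
/// code is laid out roughly as follows:
///
///     [ bypass checks ] --(a check fails)--> LoopScalarPreHeader
///            |
///     LoopVectorPreHeader
///            |
///     LoopVectorBody <---+   (steps by VF * UF per iteration)
///            |      -----+
///     LoopMiddleBlock --(iterations remain)--> LoopScalarPreHeader
///            |                                        |
///            |                                  LoopScalarBody
///            |                                        |
///            +----------> LoopExitBlock <-------------+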
398 class InnerLoopVectorizer {
399 public:
400   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
401                       LoopInfo *LI, DominatorTree *DT,
402                       const TargetLibraryInfo *TLI,
403                       const TargetTransformInfo *TTI, AssumptionCache *AC,
404                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
405                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
406                       LoopVectorizationCostModel *CM)
407       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
408         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
409         Builder(PSE.getSE()->getContext()),
410         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
411   virtual ~InnerLoopVectorizer() = default;
412 
413   /// Create a new empty loop. Unlink the old loop and connect the new one.
414   /// Return the pre-header block of the new loop.
415   BasicBlock *createVectorizedLoopSkeleton();
416 
417   /// Widen a single instruction within the innermost loop.
418   void widenInstruction(Instruction &I);
419 
420   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
421   void fixVectorizedLoop();
422 
423   // Return true if any runtime check is added.
424   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
425 
426   /// A type for vectorized values in the new loop. Each value from the
427   /// original loop, when vectorized, is represented by UF vector values in the
428   /// new unrolled loop, where UF is the unroll factor.
429   using VectorParts = SmallVector<Value *, 2>;
430 
431   /// Vectorize a single PHINode in a block. This method handles the induction
432   /// variable canonicalization. It supports both VF = 1 (for unrolled loops) and
433   /// arbitrary length vectors.
434   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
435 
436   /// A helper function to scalarize a single Instruction in the innermost loop.
437   /// Generates a scalar instance of the instruction for the vector lane and
438   /// unroll part specified by \p Instance, predicating the instruction if
439   /// \p IfPredicateInstr is true.
440   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
441                             bool IfPredicateInstr);
442 
443   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
444   /// is provided, the integer induction variable will first be truncated to
445   /// the corresponding type.
446   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
447 
448   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
449   /// vector or scalar value on-demand if one is not yet available. When
450   /// vectorizing a loop, we visit the definition of an instruction before its
451   /// uses. When visiting the definition, we either vectorize or scalarize the
452   /// instruction, creating an entry for it in the corresponding map. (In some
453   /// cases, such as induction variables, we will create both vector and scalar
454   /// entries.) Then, as we encounter uses of the definition, we derive values
455   /// for each scalar or vector use unless such a value is already available.
456   /// For example, if we scalarize a definition and one of its uses is vector,
457   /// we build the required vector on-demand with an insertelement sequence
458   /// when visiting the use. Otherwise, if the use is scalar, we can use the
459   /// existing scalar definition.
460   ///
461   /// Return a value in the new loop corresponding to \p V from the original
462   /// loop at unroll index \p Part. If the value has already been vectorized,
463   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
464   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
465   /// a new vector value on-demand by inserting the scalar values into a vector
466   /// with an insertelement sequence. If the value has been neither vectorized
467   /// nor scalarized, it must be loop invariant, so we simply broadcast the
468   /// value into a vector.
469   Value *getOrCreateVectorValue(Value *V, unsigned Part);
470 
471   /// Return a value in the new loop corresponding to \p V from the original
472   /// loop at unroll and vector indices \p Instance. If the value has been
473   /// vectorized but not scalarized, the necessary extractelement instruction
474   /// will be generated.
475   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
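
  // To make the interplay above concrete (the value names are invented for
  // this comment): if a definition %d was scalarized for VF = 4, the map holds
  // one scalar instance per lane of each unroll part, say %d.0 .. %d.3 for
  // part 0. A later vector use triggers getOrCreateVectorValue, which packs
  // the lanes back into a vector with an insertelement chain, roughly
  //
  //   %v.0 = insertelement <4 x i32> undef, i32 %d.0, i32 0
  //   %v.1 = insertelement <4 x i32> %v.0,  i32 %d.1, i32 1
  //   ...
  //
  // while a scalar use goes through getOrCreateScalarValue and simply reuses
  // the existing scalar instance for the requested part and lane.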
476 
477   /// Construct the vector value of a scalarized value \p V one lane at a time.
478   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
479 
480   /// Try to vectorize the interleaved access group that \p Instr belongs to,
481   /// optionally masking the vector operations if \p BlockInMask is non-null.
482   void vectorizeInterleaveGroup(Instruction *Instr,
483                                 VectorParts *BlockInMask = nullptr);
484 
485   /// Vectorize Load and Store instructions, optionally masking the vector
486   /// operations if \p BlockInMask is non-null.
487   void vectorizeMemoryInstruction(Instruction *Instr,
488                                   VectorParts *BlockInMask = nullptr);
489 
490   /// Set the debug location in the builder using the debug location in
491   /// the instruction.
492   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
493 
494   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
495   void fixNonInductionPHIs();
496 
497 protected:
498   friend class LoopVectorizationPlanner;
499 
500   /// A small list of PHINodes.
501   using PhiVector = SmallVector<PHINode *, 4>;
502 
503   /// A type for scalarized values in the new loop. Each value from the
504   /// original loop, when scalarized, is represented by UF x VF scalar values
505   /// in the new unrolled loop, where UF is the unroll factor and VF is the
506   /// vectorization factor.
507   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
508 
509   /// Set up the values of the IVs correctly when exiting the vector loop.
510   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
511                     Value *CountRoundDown, Value *EndValue,
512                     BasicBlock *MiddleBlock);
513 
514   /// Create a new induction variable inside L.
515   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
516                                    Value *Step, Instruction *DL);
517 
518   /// Handle all cross-iteration phis in the header.
519   void fixCrossIterationPHIs();
520 
521   /// Fix a first-order recurrence. This is the second phase of vectorizing
522   /// this phi node.
523   void fixFirstOrderRecurrence(PHINode *Phi);
524 
525   /// Fix a reduction cross-iteration phi. This is the second phase of
526   /// vectorizing this phi node.
527   void fixReduction(PHINode *Phi);
528 
529   /// The loop exit block may have single-value PHI nodes with some
530   /// incoming value. While vectorizing we only handled values that
531   /// were defined inside the loop, but such PHIs need one incoming value
532   /// for each predecessor of their parent basic block. See PR14725.
533   void fixLCSSAPHIs();
534 
535   /// Iteratively sink the scalarized operands of a predicated instruction into
536   /// the block that was created for it.
537   void sinkScalarOperands(Instruction *PredInst);
538 
539   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
540   /// represented as.
541   void truncateToMinimalBitwidths();
542 
543   /// Insert the new loop to the loop hierarchy and pass manager
544   /// and update the analysis passes.
545   void updateAnalysis();
546 
547   /// Create a broadcast instruction. This method generates a broadcast
548   /// instruction (shuffle) for loop invariant values and for the induction
549   /// value. If this is the induction variable then we extend it to N, N+1, ...
550   /// this is needed because each iteration in the loop corresponds to a SIMD
551   /// element.
552   virtual Value *getBroadcastInstrs(Value *V);
553 
554   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
555   /// to each vector element of Val. The sequence starts at StartIdx.
556   /// \p Opcode is relevant for FP induction variables.
557   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
558                                Instruction::BinaryOps Opcode =
559                                Instruction::BinaryOpsEnd);
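
  // For illustration (the values are invented): with an integer IV, applying
  // getStepVector to a splat of %x with StartIdx = 0 and Step = 1 yields
  // <%x, %x + 1, %x + 2, %x + 3> for VF = 4. For a floating-point IV the
  // per-lane additions instead use the binary operator passed in \p Opcode
  // (e.g. fadd or fsub from the induction descriptor).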
560 
561   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
562   /// variable on which to base the steps, \p Step is the size of the step, and
563   /// \p EntryVal is the value from the original loop that maps to the steps.
564   /// Note that \p EntryVal doesn't have to be an induction variable - it
565   /// can also be a truncate instruction.
566   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
567                         const InductionDescriptor &ID);
568 
569   /// Create a vector induction phi node based on an existing scalar one. \p
570   /// EntryVal is the value from the original loop that maps to the vector phi
571   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
572   /// truncate instruction, instead of widening the original IV, we widen a
573   /// version of the IV truncated to \p EntryVal's type.
574   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
575                                        Value *Step, Instruction *EntryVal);
576 
577   /// Returns true if an instruction \p I should be scalarized instead of
578   /// vectorized for the chosen vectorization factor.
579   bool shouldScalarizeInstruction(Instruction *I) const;
580 
581   /// Returns true if we should generate a scalar version of \p IV.
582   bool needsScalarInduction(Instruction *IV) const;
583 
584   /// If there is a cast involved in the induction variable \p ID, which should
585   /// be ignored in the vectorized loop body, this function records the
586   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
587   /// cast. We had already proved that the casted Phi is equal to the uncasted
588   /// Phi in the vectorized loop (under a runtime guard), and therefore
589   /// there is no need to vectorize the cast - the same value can be used in the
590   /// vector loop for both the Phi and the cast.
591   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
592   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
593   ///
594   /// \p EntryVal is the value from the original loop that maps to the vector
595   /// phi node and is used to distinguish what is the IV currently being
596   /// processed - original one (if \p EntryVal is a phi corresponding to the
597   /// original IV) or the "newly-created" one based on the proof mentioned above
598   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
599   /// latter case \p EntryVal is a TruncInst and we must not record anything for
600   /// that IV, but it's error-prone to expect callers of this routine to care
601   /// about that, hence this explicit parameter.
602   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
603                                              const Instruction *EntryVal,
604                                              Value *VectorLoopValue,
605                                              unsigned Part,
606                                              unsigned Lane = UINT_MAX);
607 
608   /// Generate a shuffle sequence that will reverse the vector Vec.
609   virtual Value *reverseVector(Value *Vec);
610 
611   /// Returns (and creates if needed) the original loop trip count.
612   Value *getOrCreateTripCount(Loop *NewLoop);
613 
614   /// Returns (and creates if needed) the trip count of the widened loop.
615   Value *getOrCreateVectorTripCount(Loop *NewLoop);
616 
617   /// Returns a bitcasted value to the requested vector type.
618   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
619   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
620                                 const DataLayout &DL);
621 
622   /// Emit a bypass check to see if the vector trip count is zero, including if
623   /// it overflows.
624   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
625 
626   /// Emit a bypass check to see if all of the SCEV assumptions we've
627   /// had to make are correct.
628   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
629 
630   /// Emit bypass checks to check any memory assumptions we may have made.
631   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
632 
633   /// Compute the transformed value of Index at offset StartValue using step
634   /// StepValue.
635   /// For integer induction, returns StartValue + Index * StepValue.
636   /// For pointer induction, returns StartValue[Index * StepValue].
637   /// FIXME: The newly created binary instructions should contain nsw/nuw
638   /// flags, which can be found from the original scalar operations.
639   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
640                               const DataLayout &DL,
641                               const InductionDescriptor &ID) const;
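
  // A small worked example of the formulas above (the numbers are arbitrary):
  // for an integer induction with StartValue = 10 and StepValue = 3, an Index
  // of 4 becomes 10 + 4 * 3 = 22. For a pointer induction with the same step,
  // the result is the address of StartValue[4 * 3], i.e. a GEP 12 elements
  // past the start pointer.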
642 
643   /// Add additional metadata to \p To that was not present on \p Orig.
644   ///
645   /// Currently this is used to add the noalias annotations based on the
646   /// inserted memchecks.  Use this for instructions that are *cloned* into the
647   /// vector loop.
648   void addNewMetadata(Instruction *To, const Instruction *Orig);
649 
650   /// Add metadata from one instruction to another.
651   ///
652   /// This includes both the original MDs from \p From and additional ones (\see
653   /// addNewMetadata).  Use this for *newly created* instructions in the vector
654   /// loop.
655   void addMetadata(Instruction *To, Instruction *From);
656 
657   /// Similar to the previous function but it adds the metadata to a
658   /// vector of instructions.
659   void addMetadata(ArrayRef<Value *> To, Instruction *From);
660 
661   /// The original loop.
662   Loop *OrigLoop;
663 
664   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
665   /// dynamic knowledge to simplify SCEV expressions and converts them to a
666   /// more usable form.
667   PredicatedScalarEvolution &PSE;
668 
669   /// Loop Info.
670   LoopInfo *LI;
671 
672   /// Dominator Tree.
673   DominatorTree *DT;
674 
675   /// Alias Analysis.
676   AliasAnalysis *AA;
677 
678   /// Target Library Info.
679   const TargetLibraryInfo *TLI;
680 
681   /// Target Transform Info.
682   const TargetTransformInfo *TTI;
683 
684   /// Assumption Cache.
685   AssumptionCache *AC;
686 
687   /// Interface to emit optimization remarks.
688   OptimizationRemarkEmitter *ORE;
689 
690   /// LoopVersioning.  It's only set up (non-null) if memchecks were
691   /// used.
692   ///
693   /// This is currently only used to add no-alias metadata based on the
694   /// memchecks.  The actual versioning is performed manually.
695   std::unique_ptr<LoopVersioning> LVer;
696 
697   /// The vectorization SIMD factor to use. Each vector will have this many
698   /// vector elements.
699   unsigned VF;
700 
701   /// The vectorization unroll factor to use. Each scalar is vectorized to this
702   /// many different vector instructions.
703   unsigned UF;
704 
705   /// The builder that we use
706   IRBuilder<> Builder;
707 
708   // --- Vectorization state ---
709 
710   /// The vector-loop preheader.
711   BasicBlock *LoopVectorPreHeader;
712 
713   /// The scalar-loop preheader.
714   BasicBlock *LoopScalarPreHeader;
715 
716   /// Middle Block between the vector and the scalar.
717   BasicBlock *LoopMiddleBlock;
718 
719   /// The ExitBlock of the scalar loop.
720   BasicBlock *LoopExitBlock;
721 
722   /// The vector loop body.
723   BasicBlock *LoopVectorBody;
724 
725   /// The scalar loop body.
726   BasicBlock *LoopScalarBody;
727 
728   /// A list of all bypass blocks. The first block is the entry of the loop.
729   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
730 
731   /// The new Induction variable which was added to the new block.
732   PHINode *Induction = nullptr;
733 
734   /// The induction variable of the old basic block.
735   PHINode *OldInduction = nullptr;
736 
737   /// Maps values from the original loop to their corresponding values in the
738   /// vectorized loop. A key value can map to either vector values, scalar
739   /// values or both kinds of values, depending on whether the key was
740   /// vectorized and scalarized.
741   VectorizerValueMap VectorLoopValueMap;
742 
743   /// Store instructions that were predicated.
744   SmallVector<Instruction *, 4> PredicatedInstructions;
745 
746   /// Trip count of the original loop.
747   Value *TripCount = nullptr;
748 
749   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
750   Value *VectorTripCount = nullptr;
751 
752   /// The legality analysis.
753   LoopVectorizationLegality *Legal;
754 
755   /// The profitability analysis.
756   LoopVectorizationCostModel *Cost;
757 
758   // Record whether runtime checks are added.
759   bool AddedSafetyChecks = false;
760 
761   // Holds the end values for each induction variable. We save the end values
762   // so we can later fix-up the external users of the induction variables.
763   DenseMap<PHINode *, Value *> IVEndValues;
764 
765   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
766   // fixed up at the end of vector code generation.
767   SmallVector<PHINode *, 8> OrigPHIsToFix;
768 };
769 
770 class InnerLoopUnroller : public InnerLoopVectorizer {
771 public:
772   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
773                     LoopInfo *LI, DominatorTree *DT,
774                     const TargetLibraryInfo *TLI,
775                     const TargetTransformInfo *TTI, AssumptionCache *AC,
776                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
777                     LoopVectorizationLegality *LVL,
778                     LoopVectorizationCostModel *CM)
779       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
780                             UnrollFactor, LVL, CM) {}
781 
782 private:
783   Value *getBroadcastInstrs(Value *V) override;
784   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
785                        Instruction::BinaryOps Opcode =
786                        Instruction::BinaryOpsEnd) override;
787   Value *reverseVector(Value *Vec) override;
788 };
789 
790 } // end namespace llvm
791 
792 /// Look for a meaningful debug location on the instruction or its
793 /// operands.
794 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
795   if (!I)
796     return I;
797 
798   DebugLoc Empty;
799   if (I->getDebugLoc() != Empty)
800     return I;
801 
802   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
803     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
804       if (OpInst->getDebugLoc() != Empty)
805         return OpInst;
806   }
807 
808   return I;
809 }
810 
811 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
812   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
813     const DILocation *DIL = Inst->getDebugLoc();
814     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
815         !isa<DbgInfoIntrinsic>(Inst)) {
816       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
817       if (NewDIL)
818         B.SetCurrentDebugLocation(NewDIL.getValue());
819       else
820         LLVM_DEBUG(dbgs()
821                    << "Failed to create new discriminator: "
822                    << DIL->getFilename() << " Line: " << DIL->getLine());
823     }
824     else
825       B.SetCurrentDebugLocation(DIL);
826   } else
827     B.SetCurrentDebugLocation(DebugLoc());
828 }
829 
830 /// Write a record \p DebugMsg about vectorization failure to the debug
831 /// output stream. If \p I is passed, it is an instruction that prevents
832 /// vectorization.
833 #ifndef NDEBUG
834 static void debugVectorizationFailure(const StringRef DebugMsg,
835     Instruction *I) {
836   dbgs() << "LV: Not vectorizing: " << DebugMsg;
837   if (I != nullptr)
838     dbgs() << " " << *I;
839   else
840     dbgs() << '.';
841   dbgs() << '\n';
842 }
843 #endif
844 
845 /// Create an analysis remark that explains why vectorization failed
846 ///
847 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
848 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
849 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
850 /// the location of the remark.  \return the remark object that can be
851 /// streamed to.
852 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
853     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
854   Value *CodeRegion = TheLoop->getHeader();
855   DebugLoc DL = TheLoop->getStartLoc();
856 
857   if (I) {
858     CodeRegion = I->getParent();
859     // If there is no debug location attached to the instruction, revert back to
860     // using the loop's.
861     if (I->getDebugLoc())
862       DL = I->getDebugLoc();
863   }
864 
865   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
866   R << "loop not vectorized: ";
867   return R;
868 }
869 
870 namespace llvm {
871 
872 void reportVectorizationFailure(const StringRef DebugMsg,
873     const StringRef OREMsg, const StringRef ORETag,
874     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
875   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
876   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
877   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
878                 ORETag, TheLoop, I) << OREMsg);
879 }
880 
881 } // end namespace llvm
882 
883 #ifndef NDEBUG
884 /// \return string containing a file name and a line # for the given loop.
885 static std::string getDebugLocString(const Loop *L) {
886   std::string Result;
887   if (L) {
888     raw_string_ostream OS(Result);
889     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
890       LoopDbgLoc.print(OS);
891     else
892       // Just print the module name.
893       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
894     OS.flush();
895   }
896   return Result;
897 }
898 #endif
899 
900 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
901                                          const Instruction *Orig) {
902   // If the loop was versioned with memchecks, add the corresponding no-alias
903   // metadata.
904   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
905     LVer->annotateInstWithNoAlias(To, Orig);
906 }
907 
908 void InnerLoopVectorizer::addMetadata(Instruction *To,
909                                       Instruction *From) {
910   propagateMetadata(To, From);
911   addNewMetadata(To, From);
912 }
913 
914 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
915                                       Instruction *From) {
916   for (Value *V : To) {
917     if (Instruction *I = dyn_cast<Instruction>(V))
918       addMetadata(I, From);
919   }
920 }
921 
922 namespace llvm {
923 
924 // Loop vectorization cost-model hints indicating how the scalar epilogue loop
925 // should be lowered.
926 enum ScalarEpilogueLowering {
927 
928   // The default: allowing scalar epilogues.
929   CM_ScalarEpilogueAllowed,
930 
931   // Vectorization with OptForSize: don't allow epilogues.
932   CM_ScalarEpilogueNotAllowedOptSize,
933 
934   // A special case of vectorization with OptForSize: loops with a very small
935   // trip count are considered for vectorization under OptForSize, thereby
936   // making sure the cost of their loop body is dominant, free of runtime
937   // guards and scalar iteration overheads.
938   CM_ScalarEpilogueNotAllowedLowTripLoop,
939 
940   // Loop hint predicate indicating an epilogue is undesired.
941   CM_ScalarEpilogueNotNeededUsePredicate
942 };
943 
944 /// LoopVectorizationCostModel - estimates the expected speedups due to
945 /// vectorization.
946 /// In many cases vectorization is not profitable. This can happen for a
947 /// number of reasons. In this class we mainly attempt to predict the
948 /// expected speedup/slowdowns due to the supported instruction set. We use the
949 /// TargetTransformInfo to query the different backends for the cost of
950 /// different operations.
951 class LoopVectorizationCostModel {
952 public:
953   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
954                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
955                              LoopVectorizationLegality *Legal,
956                              const TargetTransformInfo &TTI,
957                              const TargetLibraryInfo *TLI, DemandedBits *DB,
958                              AssumptionCache *AC,
959                              OptimizationRemarkEmitter *ORE, const Function *F,
960                              const LoopVectorizeHints *Hints,
961                              InterleavedAccessInfo &IAI)
962       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
963         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
964         Hints(Hints), InterleaveInfo(IAI) {}
965 
966   /// \return An upper bound for the vectorization factor, or None if
967   /// vectorization and interleaving should be avoided up front.
968   Optional<unsigned> computeMaxVF();
969 
970   /// \return True if runtime checks are required for vectorization, and false
971   /// otherwise.
972   bool runtimeChecksRequired();
973 
974   /// \return The most profitable vectorization factor and the cost of that VF.
975   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
976   /// then this vectorization factor will be selected if vectorization is
977   /// possible.
978   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
979 
980   /// Setup cost-based decisions for user vectorization factor.
981   void selectUserVectorizationFactor(unsigned UserVF) {
982     collectUniformsAndScalars(UserVF);
983     collectInstsToScalarize(UserVF);
984   }
985 
986   /// \return The size (in bits) of the smallest and widest types in the code
987   /// that needs to be vectorized. We ignore values that remain scalar such as
988   /// 64 bit loop indices.
989   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
990 
991   /// \return The desired interleave count.
992   /// If interleave count has been specified by metadata it will be returned.
993   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
994   /// are the selected vectorization factor and the cost of the selected VF.
995   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
996 
997   /// A memory access instruction may be vectorized in more than one way.
998   /// The form of the instruction after vectorization depends on its cost.
999   /// This function takes cost-based decisions for Load/Store instructions
1000   /// and collects them in a map. This decision map is used for building
1001   /// the lists of loop-uniform and loop-scalar instructions.
1002   /// The calculated cost is saved with the widening decision in order to
1003   /// avoid redundant calculations.
1004   void setCostBasedWideningDecision(unsigned VF);
1005 
1006   /// A struct that represents some properties of the register usage
1007   /// of a loop.
1008   struct RegisterUsage {
1009     /// Holds the number of loop invariant values that are used in the loop.
1010     /// The key is ClassID of target-provided register class.
1011     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1012     /// Holds the maximum number of concurrent live intervals in the loop.
1013     /// The key is ClassID of target-provided register class.
1014     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1015   };
1016 
1017   /// \return Returns information about the register usages of the loop for the
1018   /// given vectorization factors.
1019   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1020 
1021   /// Collect values we want to ignore in the cost model.
1022   void collectValuesToIgnore();
1023 
1024   /// \returns The smallest bitwidth each instruction can be represented with.
1025   /// The vector equivalents of these instructions should be truncated to this
1026   /// type.
1027   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1028     return MinBWs;
1029   }
1030 
1031   /// \returns True if it is more profitable to scalarize instruction \p I for
1032   /// vectorization factor \p VF.
1033   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1034     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1035 
1036     // Cost model is not run in the VPlan-native path - return conservative
1037     // result until this changes.
1038     if (EnableVPlanNativePath)
1039       return false;
1040 
1041     auto Scalars = InstsToScalarize.find(VF);
1042     assert(Scalars != InstsToScalarize.end() &&
1043            "VF not yet analyzed for scalarization profitability");
1044     return Scalars->second.find(I) != Scalars->second.end();
1045   }
1046 
1047   /// Returns true if \p I is known to be uniform after vectorization.
1048   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1049     if (VF == 1)
1050       return true;
1051 
1052     // Cost model is not run in the VPlan-native path - return conservative
1053     // result until this changes.
1054     if (EnableVPlanNativePath)
1055       return false;
1056 
1057     auto UniformsPerVF = Uniforms.find(VF);
1058     assert(UniformsPerVF != Uniforms.end() &&
1059            "VF not yet analyzed for uniformity");
1060     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1061   }
1062 
1063   /// Returns true if \p I is known to be scalar after vectorization.
1064   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1065     if (VF == 1)
1066       return true;
1067 
1068     // Cost model is not run in the VPlan-native path - return conservative
1069     // result until this changes.
1070     if (EnableVPlanNativePath)
1071       return false;
1072 
1073     auto ScalarsPerVF = Scalars.find(VF);
1074     assert(ScalarsPerVF != Scalars.end() &&
1075            "Scalar values are not calculated for VF");
1076     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1077   }
1078 
1079   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1080   /// for vectorization factor \p VF.
1081   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1082     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1083            !isProfitableToScalarize(I, VF) &&
1084            !isScalarAfterVectorization(I, VF);
1085   }
1086 
1087   /// Decision that was taken during cost calculation for memory instruction.
1088   enum InstWidening {
1089     CM_Unknown,
1090     CM_Widen,         // For consecutive accesses with stride +1.
1091     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1092     CM_Interleave,
1093     CM_GatherScatter,
1094     CM_Scalarize
1095   };
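
  // For intuition (the accesses are illustrative only): in a loop over i, a
  // load of a[i] is a natural candidate for CM_Widen, a[n - i] for
  // CM_Widen_Reverse, a[2 * i] paired with a[2 * i + 1] for CM_Interleave,
  // and a[b[i]] for CM_GatherScatter, with CM_Scalarize as the fallback when
  // none of the above is legal or profitable.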
1096 
1097   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1098   /// instruction \p I and vector width \p VF.
1099   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1100                            unsigned Cost) {
1101     assert(VF >= 2 && "Expected VF >=2");
1102     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1103   }
1104 
1105   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1106   /// interleaving group \p Grp and vector width \p VF.
1107   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1108                            InstWidening W, unsigned Cost) {
1109     assert(VF >= 2 && "Expected VF >=2");
1110     /// Broadcast this decision to all instructions inside the group.
1111     /// But the cost will be assigned to one instruction only.
1112     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1113       if (auto *I = Grp->getMember(i)) {
1114         if (Grp->getInsertPos() == I)
1115           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1116         else
1117           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1118       }
1119     }
1120   }
1121 
1122   /// Return the cost model decision for the given instruction \p I and vector
1123   /// width \p VF. Return CM_Unknown if this instruction did not pass
1124   /// through the cost modeling.
1125   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1126     assert(VF >= 2 && "Expected VF >=2");
1127 
1128     // Cost model is not run in the VPlan-native path - return conservative
1129     // result until this changes.
1130     if (EnableVPlanNativePath)
1131       return CM_GatherScatter;
1132 
1133     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1134     auto Itr = WideningDecisions.find(InstOnVF);
1135     if (Itr == WideningDecisions.end())
1136       return CM_Unknown;
1137     return Itr->second.first;
1138   }
1139 
1140   /// Return the vectorization cost for the given instruction \p I and vector
1141   /// width \p VF.
1142   unsigned getWideningCost(Instruction *I, unsigned VF) {
1143     assert(VF >= 2 && "Expected VF >=2");
1144     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1145     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1146            "The cost is not calculated");
1147     return WideningDecisions[InstOnVF].second;
1148   }
1149 
1150   /// Return True if instruction \p I is an optimizable truncate whose operand
1151   /// is an induction variable. Such a truncate will be removed by adding a new
1152   /// induction variable with the destination type.
1153   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1154     // If the instruction is not a truncate, return false.
1155     auto *Trunc = dyn_cast<TruncInst>(I);
1156     if (!Trunc)
1157       return false;
1158 
1159     // Get the source and destination types of the truncate.
1160     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1161     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1162 
1163     // If the truncate is free for the given types, return false. Replacing a
1164     // free truncate with an induction variable would add an induction variable
1165     // update instruction to each iteration of the loop. We exclude from this
1166     // check the primary induction variable since it will need an update
1167     // instruction regardless.
1168     Value *Op = Trunc->getOperand(0);
1169     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1170       return false;
1171 
1172     // If the truncated value is not an induction variable, return false.
1173     return Legal->isInductionPhi(Op);
1174   }
1175 
1176   /// Collects the instructions to scalarize for each predicated instruction in
1177   /// the loop.
1178   void collectInstsToScalarize(unsigned VF);
1179 
1180   /// Collect Uniform and Scalar values for the given \p VF.
1181   /// The sets depend on CM decision for Load/Store instructions
1182   /// that may be vectorized as interleave, gather-scatter or scalarized.
1183   void collectUniformsAndScalars(unsigned VF) {
1184     // Do the analysis once.
1185     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1186       return;
1187     setCostBasedWideningDecision(VF);
1188     collectLoopUniforms(VF);
1189     collectLoopScalars(VF);
1190   }
1191 
1192   /// Returns true if the target machine supports masked store operation
1193   /// for the given \p DataType and kind of access to \p Ptr.
1194   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1195     return Legal->isConsecutivePtr(Ptr) &&
1196            TTI.isLegalMaskedStore(DataType, Alignment);
1197   }
1198 
1199   /// Returns true if the target machine supports masked load operation
1200   /// for the given \p DataType and kind of access to \p Ptr.
1201   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1202     return Legal->isConsecutivePtr(Ptr) &&
1203            TTI.isLegalMaskedLoad(DataType, Alignment);
1204   }
1205 
1206   /// Returns true if the target machine supports masked scatter operation
1207   /// for the given \p DataType.
1208   bool isLegalMaskedScatter(Type *DataType) {
1209     return TTI.isLegalMaskedScatter(DataType);
1210   }
1211 
1212   /// Returns true if the target machine supports masked gather operation
1213   /// for the given \p DataType.
1214   bool isLegalMaskedGather(Type *DataType) {
1215     return TTI.isLegalMaskedGather(DataType);
1216   }
1217 
1218   /// Returns true if the target machine can represent \p V as a masked gather
1219   /// or scatter operation.
1220   bool isLegalGatherOrScatter(Value *V) {
1221     bool LI = isa<LoadInst>(V);
1222     bool SI = isa<StoreInst>(V);
1223     if (!LI && !SI)
1224       return false;
1225     auto *Ty = getMemInstValueType(V);
1226     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1227   }
1228 
1229   /// Returns true if \p I is an instruction that will be scalarized with
1230   /// predication. Such instructions include conditional stores and
1231   /// instructions that may divide by zero.
1232   /// If a non-zero VF has been calculated, we check if I will be scalarized
1233   /// with predication for that VF.
1234   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1235 
  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or a masked load/store or masked
  // gather/scatter. This is a superset of the instructions that return true
  // for isScalarWithPredication.
1239   bool isPredicatedInst(Instruction *I) {
1240     if (!blockNeedsPredication(I->getParent()))
1241       return false;
1242     // Loads and stores that need some form of masked operation are predicated
1243     // instructions.
1244     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1245       return Legal->isMaskRequired(I);
1246     return isScalarWithPredication(I);
1247   }
1248 
1249   /// Returns true if \p I is a memory instruction with consecutive memory
1250   /// access that can be widened.
1251   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1252 
1253   /// Returns true if \p I is a memory instruction in an interleaved-group
1254   /// of memory accesses that can be vectorized with wide vector loads/stores
1255   /// and shuffles.
1256   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1257 
1258   /// Check if \p Instr belongs to any interleaved access group.
1259   bool isAccessInterleaved(Instruction *Instr) {
1260     return InterleaveInfo.isInterleaved(Instr);
1261   }
1262 
1263   /// Get the interleaved access group that \p Instr belongs to.
1264   const InterleaveGroup<Instruction> *
1265   getInterleavedAccessGroup(Instruction *Instr) {
1266     return InterleaveInfo.getInterleaveGroup(Instr);
1267   }
1268 
1269   /// Returns true if an interleaved group requires a scalar iteration
1270   /// to handle accesses with gaps, and there is nothing preventing us from
1271   /// creating a scalar epilogue.
1272   bool requiresScalarEpilogue() const {
1273     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1274   }
1275 
1276   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1277   /// loop hint annotation.
1278   bool isScalarEpilogueAllowed() const {
1279     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1280   }
1281 
1282   /// Returns true if all loop blocks should be masked to fold tail loop.
1283   bool foldTailByMasking() const { return FoldTailByMasking; }
1284 
1285   bool blockNeedsPredication(BasicBlock *BB) {
1286     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1287   }
1288 
1289   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1290   /// with factor VF.  Return the cost of the instruction, including
1291   /// scalarization overhead if it's needed.
1292   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1293 
1294   /// Estimate cost of a call instruction CI if it were vectorized with factor
1295   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1299   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1300 
1301 private:
1302   unsigned NumPredStores = 0;
1303 
1304   /// \return An upper bound for the vectorization factor, larger than zero.
1305   /// One is returned if vectorization should best be avoided due to cost.
1306   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1307 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
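  /// For example (illustrative), a result of {8, false} means an estimated
  /// cost of 8 where none of the contributing operations is expected to
  /// remain a vector operation after legalization, i.e. no real vector code
  /// would be produced.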
1315   using VectorizationCostTy = std::pair<unsigned, bool>;
1316 
1317   /// Returns the expected execution cost. The unit of the cost does
1318   /// not matter because we use the 'cost' units to compare different
1319   /// vector widths. The cost that is returned is *not* normalized by
1320   /// the factor width.
1321   VectorizationCostTy expectedCost(unsigned VF);
1322 
1323   /// Returns the execution time cost of an instruction for a given vector
1324   /// width. Vector width of one means scalar.
1325   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1326 
1327   /// The cost-computation logic from getInstructionCost which provides
1328   /// the vector type as an output parameter.
1329   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1330 
1331   /// Calculate vectorization cost of memory instruction \p I.
1332   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1333 
  /// The cost computation for a scalarized memory instruction.
1335   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1336 
  /// The cost computation for an interleave group of memory instructions.
1338   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1339 
1340   /// The cost computation for Gather/Scatter instruction.
1341   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1342 
1343   /// The cost computation for widening instruction \p I with consecutive
1344   /// memory access.
1345   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1346 
  /// The cost calculation for a Load/Store instruction \p I with a uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element)
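  /// For example (illustrative), a load from a loop-invariant address inside
  /// the loop costs a single scalar load plus a broadcast of the loaded
  /// value, rather than a full gather.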
1351   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1352 
1353   /// Estimate the overhead of scalarizing an instruction. This is a
1354   /// convenience wrapper for the type-based getScalarizationOverhead API.
1355   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1356 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1359   bool isConsecutiveLoadOrStore(Instruction *I);
1360 
1361   /// Returns true if an artificially high cost for emulated masked memrefs
1362   /// should be used.
1363   bool useEmulatedMaskMemRefHack(Instruction *I);
1364 
1365   /// Map of scalar integer values to the smallest bitwidth they can be legally
1366   /// represented as. The vector equivalents of these values should be truncated
1367   /// to this type.
1368   MapVector<Instruction *, uint64_t> MinBWs;
1369 
1370   /// A type representing the costs for instructions if they were to be
1371   /// scalarized rather than vectorized. The entries are Instruction-Cost
1372   /// pairs.
1373   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1374 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1377   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1378 
1379   /// Records whether it is allowed to have the original scalar loop execute at
1380   /// least once. This may be needed as a fallback loop in case runtime
1381   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or isn't divisible by the VF,
1383   /// or as a peel-loop to handle gaps in interleave-groups.
1384   /// Under optsize and when the trip count is very small we don't allow any
1385   /// iterations to execute in the scalar loop.
1386   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1387 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1389   bool FoldTailByMasking = false;
1390 
1391   /// A map holding scalar costs for different vectorization factors. The
1392   /// presence of a cost for an instruction in the mapping indicates that the
1393   /// instruction will be scalarized when vectorizing with the associated
1394   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1395   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1396 
1397   /// Holds the instructions known to be uniform after vectorization.
1398   /// The data is collected per VF.
1399   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1400 
1401   /// Holds the instructions known to be scalar after vectorization.
1402   /// The data is collected per VF.
1403   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1404 
1405   /// Holds the instructions (address computations) that are forced to be
1406   /// scalarized.
1407   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1408 
1409   /// Returns the expected difference in cost from scalarizing the expression
1410   /// feeding a predicated instruction \p PredInst. The instructions to
1411   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1412   /// non-negative return value implies the expression will be scalarized.
1413   /// Currently, only single-use chains are considered for scalarization.
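  /// For example (illustrative), when a predicated udiv is scalarized, its
  /// single-use operand chain may be cheaper to compute as scalars as well,
  /// since that avoids packing the operands into vectors only to extract the
  /// lanes again inside the predicated blocks.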
1414   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1415                               unsigned VF);
1416 
1417   /// Collect the instructions that are uniform after vectorization. An
1418   /// instruction is uniform if we represent it with a single scalar value in
1419   /// the vectorized loop corresponding to each vector iteration. Examples of
1420   /// uniform instructions include pointer operands of consecutive or
1421   /// interleaved memory accesses. Note that although uniformity implies an
1422   /// instruction will be scalar, the reverse is not true. In general, a
1423   /// scalarized instruction will be represented by VF scalar values in the
1424   /// vectorized loop, each corresponding to an iteration of the original
1425   /// scalar loop.
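  /// For example (illustrative), the scalar GEP feeding a consecutive load of
  /// a[i] is uniform: one scalar address per vector iteration suffices. A
  /// scalarized, predicated udiv, in contrast, is replicated into VF scalar
  /// instructions per vector iteration.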
1426   void collectLoopUniforms(unsigned VF);
1427 
1428   /// Collect the instructions that are scalar after vectorization. An
1429   /// instruction is scalar if it is known to be uniform or will be scalarized
1430   /// during vectorization. Non-uniform scalarized instructions will be
1431   /// represented by VF values in the vectorized loop, each corresponding to an
1432   /// iteration of the original scalar loop.
1433   void collectLoopScalars(unsigned VF);
1434 
  /// Keeps the cost model's vectorization decision and cost for each
  /// instruction. Currently it is used for memory instructions only.
1437   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1438                                 std::pair<InstWidening, unsigned>>;
1439 
1440   DecisionList WideningDecisions;
1441 
1442   /// Returns true if \p V is expected to be vectorized and it needs to be
1443   /// extracted.
1444   bool needsExtract(Value *V, unsigned VF) const {
1445     Instruction *I = dyn_cast<Instruction>(V);
1446     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1447       return false;
1448 
1449     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1451     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1452     // the scalars are collected. That should be a safe assumption in most
1453     // cases, because we check if the operands have vectorizable types
1454     // beforehand in LoopVectorizationLegality.
1455     return Scalars.find(VF) == Scalars.end() ||
1456            !isScalarAfterVectorization(I, VF);
1457   };
1458 
1459   /// Returns a range containing only operands needing to be extracted.
1460   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1461                                                    unsigned VF) {
1462     return SmallVector<Value *, 4>(make_filter_range(
1463         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1464   }
1465 
1466 public:
1467   /// The loop that we evaluate.
1468   Loop *TheLoop;
1469 
1470   /// Predicated scalar evolution analysis.
1471   PredicatedScalarEvolution &PSE;
1472 
1473   /// Loop Info analysis.
1474   LoopInfo *LI;
1475 
1476   /// Vectorization legality.
1477   LoopVectorizationLegality *Legal;
1478 
1479   /// Vector target information.
1480   const TargetTransformInfo &TTI;
1481 
1482   /// Target Library Info.
1483   const TargetLibraryInfo *TLI;
1484 
1485   /// Demanded bits analysis.
1486   DemandedBits *DB;
1487 
1488   /// Assumption cache.
1489   AssumptionCache *AC;
1490 
1491   /// Interface to emit optimization remarks.
1492   OptimizationRemarkEmitter *ORE;
1493 
1494   const Function *TheFunction;
1495 
1496   /// Loop Vectorize Hint.
1497   const LoopVectorizeHints *Hints;
1498 
  /// The interleaved access information contains groups of interleaved
  /// accesses with the same stride that are close to each other.
1501   InterleavedAccessInfo &InterleaveInfo;
1502 
1503   /// Values to ignore in the cost model.
1504   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1505 
1506   /// Values to ignore in the cost model when VF > 1.
1507   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1508 };
1509 
1510 } // end namespace llvm
1511 
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legality checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
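// For example (illustrative only), an outer loop annotated as
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// is treated as explicitly vectorized with VF = 4, whereas the same loop nest
// without the simdlen clause is ignored by this path.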
1526 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1527                                    OptimizationRemarkEmitter *ORE) {
1528   assert(!OuterLp->empty() && "This is not an outer loop");
1529   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1530 
1531   // Only outer loops with an explicit vectorization hint are supported.
1532   // Unannotated outer loops are ignored.
1533   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1534     return false;
1535 
1536   Function *Fn = OuterLp->getHeader()->getParent();
1537   if (!Hints.allowVectorization(Fn, OuterLp,
1538                                 true /*VectorizeOnlyWhenForced*/)) {
1539     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1540     return false;
1541   }
1542 
1543   if (Hints.getInterleave() > 1) {
1544     // TODO: Interleave support is future work.
1545     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1546                          "outer loops.\n");
1547     Hints.emitRemarkWithHints();
1548     return false;
1549   }
1550 
1551   return true;
1552 }
1553 
1554 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1555                                   OptimizationRemarkEmitter *ORE,
1556                                   SmallVectorImpl<Loop *> &V) {
1557   // Collect inner loops and outer loops without irreducible control flow. For
1558   // now, only collect outer loops that have explicit vectorization hints. If we
1559   // are stress testing the VPlan H-CFG construction, we collect the outermost
1560   // loop of every loop nest.
1561   if (L.empty() || VPlanBuildStressTest ||
1562       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1563     LoopBlocksRPO RPOT(&L);
1564     RPOT.perform(LI);
1565     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1566       V.push_back(&L);
1567       // TODO: Collect inner loops inside marked outer loops in case
1568       // vectorization fails for the outer loop. Do not invoke
1569       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1570       // already known to be reducible. We can use an inherited attribute for
1571       // that.
1572       return;
1573     }
1574   }
1575   for (Loop *InnerL : L)
1576     collectSupportedLoops(*InnerL, LI, ORE, V);
1577 }
1578 
1579 namespace {
1580 
1581 /// The LoopVectorize Pass.
1582 struct LoopVectorize : public FunctionPass {
1583   /// Pass identification, replacement for typeid
1584   static char ID;
1585 
1586   LoopVectorizePass Impl;
1587 
1588   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1589                          bool VectorizeOnlyWhenForced = false)
1590       : FunctionPass(ID) {
1591     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1592     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1593     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1594   }
1595 
1596   bool runOnFunction(Function &F) override {
1597     if (skipFunction(F))
1598       return false;
1599 
1600     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1601     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1602     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1603     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1604     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1605     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1606     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1607     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1608     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1609     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1610     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1611     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1612     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1613 
1614     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1615         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1616 
1617     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1618                         GetLAA, *ORE, PSI);
1619   }
1620 
1621   void getAnalysisUsage(AnalysisUsage &AU) const override {
1622     AU.addRequired<AssumptionCacheTracker>();
1623     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1624     AU.addRequired<DominatorTreeWrapperPass>();
1625     AU.addRequired<LoopInfoWrapperPass>();
1626     AU.addRequired<ScalarEvolutionWrapperPass>();
1627     AU.addRequired<TargetTransformInfoWrapperPass>();
1628     AU.addRequired<AAResultsWrapperPass>();
1629     AU.addRequired<LoopAccessLegacyAnalysis>();
1630     AU.addRequired<DemandedBitsWrapperPass>();
1631     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1632 
1633     // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
1636     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1637     if (!EnableVPlanNativePath) {
1638       AU.addPreserved<LoopInfoWrapperPass>();
1639       AU.addPreserved<DominatorTreeWrapperPass>();
1640     }
1641 
1642     AU.addPreserved<BasicAAWrapperPass>();
1643     AU.addPreserved<GlobalsAAWrapperPass>();
1644     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1645   }
1646 };
1647 
1648 } // end anonymous namespace
1649 
1650 //===----------------------------------------------------------------------===//
1651 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1652 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1653 //===----------------------------------------------------------------------===//
1654 
1655 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
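  // For VF = 4 and an i32 value %v, the splat below is typically emitted as
  // (illustrative IR):
  //   %splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef,
  //                          <4 x i32> zeroinitializer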
1659   Instruction *Instr = dyn_cast<Instruction>(V);
1660   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1661                      (!Instr ||
1662                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1663   // Place the code for broadcasting invariant variables in the new preheader.
1664   IRBuilder<>::InsertPointGuard Guard(Builder);
1665   if (SafeToHoist)
1666     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1667 
1668   // Broadcast the scalar into all locations in the vector.
1669   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1670 
1671   return Shuf;
1672 }
1673 
1674 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1675     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1676   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1677          "Expected either an induction phi-node or a truncate of it!");
1678   Value *Start = II.getStartValue();
1679 
1680   // Construct the initial value of the vector IV in the vector loop preheader
1681   auto CurrIP = Builder.saveIP();
1682   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1683   if (isa<TruncInst>(EntryVal)) {
1684     assert(Start->getType()->isIntegerTy() &&
1685            "Truncation requires an integer type");
1686     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1687     Step = Builder.CreateTrunc(Step, TruncType);
1688     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1689   }
1690   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1691   Value *SteppedStart =
1692       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1693 
1694   // We create vector phi nodes for both integer and floating-point induction
1695   // variables. Here, we determine the kind of arithmetic we will perform.
1696   Instruction::BinaryOps AddOp;
1697   Instruction::BinaryOps MulOp;
1698   if (Step->getType()->isIntegerTy()) {
1699     AddOp = Instruction::Add;
1700     MulOp = Instruction::Mul;
1701   } else {
1702     AddOp = II.getInductionOpcode();
1703     MulOp = Instruction::FMul;
1704   }
1705 
1706   // Multiply the vectorization factor by the step using integer or
1707   // floating-point arithmetic as appropriate.
1708   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1709   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1710 
1711   // Create a vector splat to use in the induction update.
1712   //
1713   // FIXME: If the step is non-constant, we create the vector splat with
1714   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1715   //        handle a constant vector splat.
1716   Value *SplatVF = isa<Constant>(Mul)
1717                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1718                        : Builder.CreateVectorSplat(VF, Mul);
1719   Builder.restoreIP(CurrIP);
1720 
1721   // We may need to add the step a number of times, depending on the unroll
1722   // factor. The last of those goes into the PHI.
1723   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1724                                     &*LoopVectorBody->getFirstInsertionPt());
1725   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1726   Instruction *LastInduction = VecInd;
1727   for (unsigned Part = 0; Part < UF; ++Part) {
1728     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1729 
1730     if (isa<TruncInst>(EntryVal))
1731       addMetadata(LastInduction, EntryVal);
1732     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1733 
1734     LastInduction = cast<Instruction>(addFastMathFlag(
1735         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1736     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1737   }
1738 
1739   // Move the last step to the end of the latch block. This ensures consistent
1740   // placement of all induction updates.
1741   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1742   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1743   auto *ICmp = cast<Instruction>(Br->getCondition());
1744   LastInduction->moveBefore(ICmp);
1745   LastInduction->setName("vec.ind.next");
1746 
1747   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1748   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1749 }
1750 
1751 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1752   return Cost->isScalarAfterVectorization(I, VF) ||
1753          Cost->isProfitableToScalarize(I, VF);
1754 }
1755 
1756 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1757   if (shouldScalarizeInstruction(IV))
1758     return true;
1759   auto isScalarInst = [&](User *U) -> bool {
1760     auto *I = cast<Instruction>(U);
1761     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1762   };
1763   return llvm::any_of(IV->users(), isScalarInst);
1764 }
1765 
1766 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1767     const InductionDescriptor &ID, const Instruction *EntryVal,
1768     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1769   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1770          "Expected either an induction phi-node or a truncate of it!");
1771 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't
  // have to do any recording in this case - that is done when the original IV
  // is processed.
1778   if (isa<TruncInst>(EntryVal))
1779     return;
1780 
1781   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1782   if (Casts.empty())
1783     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any) have no uses outside the
  // induction update chain itself.
1787   Instruction *CastInst = *Casts.begin();
1788   if (Lane < UINT_MAX)
1789     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1790   else
1791     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1792 }
1793 
1794 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1795   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1796          "Primary induction variable must have an integer type");
1797 
1798   auto II = Legal->getInductionVars()->find(IV);
1799   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1800 
1801   auto ID = II->second;
1802   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1803 
1804   // The scalar value to broadcast. This will be derived from the canonical
1805   // induction variable.
1806   Value *ScalarIV = nullptr;
1807 
1808   // The value from the original loop to which we are mapping the new induction
1809   // variable.
1810   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1811 
1812   // True if we have vectorized the induction variable.
1813   auto VectorizedIV = false;
1814 
1815   // Determine if we want a scalar version of the induction variable. This is
1816   // true if the induction variable itself is not widened, or if it has at
1817   // least one user in the loop that is not widened.
1818   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1819 
1820   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1822   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1823          "Induction step should be loop invariant");
1824   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1825   Value *Step = nullptr;
1826   if (PSE.getSE()->isSCEVable(IV->getType())) {
1827     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1828     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1829                              LoopVectorPreHeader->getTerminator());
1830   } else {
1831     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1832   }
1833 
1834   // Try to create a new independent vector induction variable. If we can't
1835   // create the phi node, we will splat the scalar induction variable in each
1836   // loop iteration.
1837   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1838     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1839     VectorizedIV = true;
1840   }
1841 
1842   // If we haven't yet vectorized the induction variable, or if we will create
1843   // a scalar one, we need to define the scalar induction variable and step
1844   // values. If we were given a truncation type, truncate the canonical
1845   // induction variable and step. Otherwise, derive these values from the
1846   // induction descriptor.
1847   if (!VectorizedIV || NeedsScalarIV) {
1848     ScalarIV = Induction;
1849     if (IV != OldInduction) {
1850       ScalarIV = IV->getType()->isIntegerTy()
1851                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1852                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1853                                           IV->getType());
1854       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1855       ScalarIV->setName("offset.idx");
1856     }
1857     if (Trunc) {
1858       auto *TruncType = cast<IntegerType>(Trunc->getType());
1859       assert(Step->getType()->isIntegerTy() &&
1860              "Truncation requires an integer step");
1861       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1862       Step = Builder.CreateTrunc(Step, TruncType);
1863     }
1864   }
1865 
1866   // If we haven't yet vectorized the induction variable, splat the scalar
1867   // induction variable, and build the necessary step vectors.
1868   // TODO: Don't do it unless the vectorized IV is really required.
1869   if (!VectorizedIV) {
1870     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1871     for (unsigned Part = 0; Part < UF; ++Part) {
1872       Value *EntryPart =
1873           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1874       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1875       if (Trunc)
1876         addMetadata(EntryPart, Trunc);
1877       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1878     }
1879   }
1880 
1881   // If an induction variable is only used for counting loop iterations or
1882   // calculating addresses, it doesn't need to be widened. Create scalar steps
1883   // that can be used by instructions we will later scalarize. Note that the
1884   // addition of the scalar steps will not increase the number of instructions
1885   // in the loop in the common case prior to InstCombine. We will be trading
1886   // one vector extract for each scalar step.
1887   if (NeedsScalarIV)
1888     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1889 }
1890 
1891 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1892                                           Instruction::BinaryOps BinOp) {
1893   // Create and check the types.
1894   assert(Val->getType()->isVectorTy() && "Must be a vector");
1895   int VLen = Val->getType()->getVectorNumElements();
1896 
1897   Type *STy = Val->getType()->getScalarType();
1898   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1899          "Induction Step must be an integer or FP");
1900   assert(Step->getType() == STy && "Step has wrong type");
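  // E.g. (illustrative) for VLen = 4, StartIdx = 0 and an integer step %s,
  // the result is Val + <0, 1, 2, 3> * %s, i.e. lane i holds Val[i] + i * %s.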
1901 
1902   SmallVector<Constant *, 8> Indices;
1903 
1904   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from StartIdx to
    // StartIdx + VLen - 1.
1906     for (int i = 0; i < VLen; ++i)
1907       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1908 
1909     // Add the consecutive indices to the vector value.
1910     Constant *Cv = ConstantVector::get(Indices);
1911     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1912     Step = Builder.CreateVectorSplat(VLen, Step);
1913     assert(Step->getType() == Val->getType() && "Invalid step vec");
1914     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1915     // which can be found from the original scalar operations.
1916     Step = Builder.CreateMul(Cv, Step);
1917     return Builder.CreateAdd(Val, Step, "induction");
1918   }
1919 
1920   // Floating point induction.
1921   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1922          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from StartIdx to
  // StartIdx + VLen - 1.
1924   for (int i = 0; i < VLen; ++i)
1925     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1926 
1927   // Add the consecutive indices to the vector value.
1928   Constant *Cv = ConstantVector::get(Indices);
1929 
1930   Step = Builder.CreateVectorSplat(VLen, Step);
1931 
1932   // Floating point operations had to be 'fast' to enable the induction.
1933   FastMathFlags Flags;
1934   Flags.setFast();
1935 
1936   Value *MulOp = Builder.CreateFMul(Cv, Step);
1937   if (isa<Instruction>(MulOp))
    // Have to check: MulOp may be a constant.
1939     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1940 
1941   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1942   if (isa<Instruction>(BOp))
1943     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1944   return BOp;
1945 }
1946 
1947 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1948                                            Instruction *EntryVal,
1949                                            const InductionDescriptor &ID) {
1950   // We shouldn't have to build scalar steps if we aren't vectorizing.
1951   assert(VF > 1 && "VF should be greater than one");
1952 
  // Get the value type and ensure it and the step have the same type.
1954   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1955   assert(ScalarIVTy == Step->getType() &&
1956          "Val and Step should have the same type");
1957 
1958   // We build scalar steps for both integer and floating-point induction
1959   // variables. Here, we determine the kind of arithmetic we will perform.
1960   Instruction::BinaryOps AddOp;
1961   Instruction::BinaryOps MulOp;
1962   if (ScalarIVTy->isIntegerTy()) {
1963     AddOp = Instruction::Add;
1964     MulOp = Instruction::Mul;
1965   } else {
1966     AddOp = ID.getInductionOpcode();
1967     MulOp = Instruction::FMul;
1968   }
1969 
1970   // Determine the number of scalars we need to generate for each unroll
1971   // iteration. If EntryVal is uniform, we only need to generate the first
1972   // lane. Otherwise, we generate all VF values.
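  // E.g. (illustrative) with VF = 4 and UF = 2, the non-uniform case produces
  // the eight scalar values ScalarIV + {0, 1, ..., 7} * Step, recorded per
  // (Part, Lane) in VectorLoopValueMap.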
1973   unsigned Lanes =
1974       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1975                                                                          : VF;
1976   // Compute the scalar steps and save the results in VectorLoopValueMap.
1977   for (unsigned Part = 0; Part < UF; ++Part) {
1978     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1979       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1980       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1981       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1982       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1983       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1984     }
1985   }
1986 }
1987 
1988 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1989   assert(V != Induction && "The new induction variable should not be used.");
1990   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1991   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1992 
1993   // If we have a stride that is replaced by one, do it here. Defer this for
1994   // the VPlan-native path until we start running Legal checks in that path.
1995   if (!EnableVPlanNativePath && Legal->hasStride(V))
1996     V = ConstantInt::get(V->getType(), 1);
1997 
1998   // If we have a vector mapped to this value, return it.
1999   if (VectorLoopValueMap.hasVectorValue(V, Part))
2000     return VectorLoopValueMap.getVectorValue(V, Part);
2001 
2002   // If the value has not been vectorized, check if it has been scalarized
2003   // instead. If it has been scalarized, and we actually need the value in
2004   // vector form, we will construct the vector values on demand.
2005   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2006     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2007 
2008     // If we've scalarized a value, that value should be an instruction.
2009     auto *I = cast<Instruction>(V);
2010 
2011     // If we aren't vectorizing, we can just copy the scalar map values over to
2012     // the vector map.
2013     if (VF == 1) {
2014       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2015       return ScalarValue;
2016     }
2017 
2018     // Get the last scalar instruction we generated for V and Part. If the value
2019     // is known to be uniform after vectorization, this corresponds to lane zero
2020     // of the Part unroll iteration. Otherwise, the last instruction is the one
2021     // we created for the last vector lane of the Part unroll iteration.
2022     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2023     auto *LastInst = cast<Instruction>(
2024         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2025 
2026     // Set the insert point after the last scalarized instruction. This ensures
2027     // the insertelement sequence will directly follow the scalar definitions.
2028     auto OldIP = Builder.saveIP();
2029     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2030     Builder.SetInsertPoint(&*NewIP);
2031 
2032     // However, if we are vectorizing, we need to construct the vector values.
2033     // If the value is known to be uniform after vectorization, we can just
2034     // broadcast the scalar value corresponding to lane zero for each unroll
2035     // iteration. Otherwise, we construct the vector values using insertelement
2036     // instructions. Since the resulting vectors are stored in
2037     // VectorLoopValueMap, we will only generate the insertelements once.
2038     Value *VectorValue = nullptr;
2039     if (Cost->isUniformAfterVectorization(I, VF)) {
2040       VectorValue = getBroadcastInstrs(ScalarValue);
2041       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2042     } else {
2043       // Initialize packing with insertelements to start from undef.
2044       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2045       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2046       for (unsigned Lane = 0; Lane < VF; ++Lane)
2047         packScalarIntoVectorValue(V, {Part, Lane});
2048       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2049     }
2050     Builder.restoreIP(OldIP);
2051     return VectorValue;
2052   }
2053 
2054   // If this scalar is unknown, assume that it is a constant or that it is
2055   // loop invariant. Broadcast V and save the value for future uses.
2056   Value *B = getBroadcastInstrs(V);
2057   VectorLoopValueMap.setVectorValue(V, Part, B);
2058   return B;
2059 }
2060 
2061 Value *
2062 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2063                                             const VPIteration &Instance) {
2064   // If the value is not an instruction contained in the loop, it should
2065   // already be scalar.
2066   if (OrigLoop->isLoopInvariant(V))
2067     return V;
2068 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2072 
2073   // If the value from the original loop has not been vectorized, it is
2074   // represented by UF x VF scalar values in the new loop. Return the requested
2075   // scalar value.
2076   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2077     return VectorLoopValueMap.getScalarValue(V, Instance);
2078 
2079   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2080   // for the given unroll part. If this entry is not a vector type (i.e., the
2081   // vectorization factor is one), there is no need to generate an
2082   // extractelement instruction.
2083   auto *U = getOrCreateVectorValue(V, Instance.Part);
2084   if (!U->getType()->isVectorTy()) {
2085     assert(VF == 1 && "Value not scalarized has non-vector type");
2086     return U;
2087   }
2088 
2089   // Otherwise, the value from the original loop has been vectorized and is
2090   // represented by UF vector values. Extract and return the requested scalar
2091   // value from the appropriate vector lane.
2092   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2093 }
2094 
2095 void InnerLoopVectorizer::packScalarIntoVectorValue(
2096     Value *V, const VPIteration &Instance) {
2097   assert(V != Induction && "The new induction variable should not be used.");
2098   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2099   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2100 
2101   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2102   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2103   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2104                                             Builder.getInt32(Instance.Lane));
2105   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2106 }
2107 
2108 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2109   assert(Vec->getType()->isVectorTy() && "Invalid type");
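  // E.g. (illustrative) for VF = 4 the shuffle mask built below is
  // <3, 2, 1, 0>.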
2110   SmallVector<Constant *, 8> ShuffleMask;
2111   for (unsigned i = 0; i < VF; ++i)
2112     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2113 
2114   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2115                                      ConstantVector::get(ShuffleMask),
2116                                      "reverse");
2117 }
2118 
2119 // Return whether we allow using masked interleave-groups (for dealing with
2120 // strided loads/stores that reside in predicated blocks, or for dealing
2121 // with gaps).
2122 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2123   // If an override option has been passed in for interleaved accesses, use it.
2124   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2125     return EnableMaskedInterleavedMemAccesses;
2126 
2127   return TTI.enableMaskedInterleavedAccessVectorization();
2128 }
2129 
2130 // Try to vectorize the interleave group that \p Instr belongs to.
2131 //
2132 // E.g. Translate following interleaved load group (factor = 3):
2133 //   for (i = 0; i < N; i+=3) {
2134 //     R = Pic[i];             // Member of index 0
2135 //     G = Pic[i+1];           // Member of index 1
2136 //     B = Pic[i+2];           // Member of index 2
2137 //     ... // do something to R, G, B
2138 //   }
2139 // To:
2140 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2141 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2142 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2143 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2144 //
2145 // Or translate following interleaved store group (factor = 3):
2146 //   for (i = 0; i < N; i+=3) {
2147 //     ... do something to R, G, B
2148 //     Pic[i]   = R;           // Member of index 0
2149 //     Pic[i+1] = G;           // Member of index 1
2150 //     Pic[i+2] = B;           // Member of index 2
2151 //   }
2152 // To:
2153 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2154 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2155 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2156 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2157 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2158 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2159                                                    VectorParts *BlockInMask) {
2160   const InterleaveGroup<Instruction> *Group =
2161       Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Failed to get an interleaved access group.");
2163 
2164   // Skip if current instruction is not the insert position.
2165   if (Instr != Group->getInsertPos())
2166     return;
2167 
2168   const DataLayout &DL = Instr->getModule()->getDataLayout();
2169   Value *Ptr = getLoadStorePointerOperand(Instr);
2170 
2171   // Prepare for the vector type of the interleaved load/store.
2172   Type *ScalarTy = getMemInstValueType(Instr);
2173   unsigned InterleaveFactor = Group->getFactor();
2174   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2175   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2176 
2177   // Prepare for the new pointers.
2178   setDebugLocFromInst(Builder, Ptr);
2179   SmallVector<Value *, 2> NewPtrs;
2180   unsigned Index = Group->getIndex(Instr);
2181 
2182   VectorParts Mask;
2183   bool IsMaskForCondRequired = BlockInMask;
2184   if (IsMaskForCondRequired) {
2185     Mask = *BlockInMask;
2186     // TODO: extend the masked interleaved-group support to reversed access.
2187     assert(!Group->isReverse() && "Reversed masked interleave-group "
2188                                   "not supported.");
2189   }
2190 
2191   // If the group is reverse, adjust the index to refer to the last vector lane
2192   // instead of the first. We adjust the index from the first vector lane,
2193   // rather than directly getting the pointer for lane VF - 1, because the
2194   // pointer operand of the interleaved access is supposed to be uniform. For
2195   // uniform instructions, we're only required to generate a value for the
2196   // first vector lane in each unroll iteration.
2197   if (Group->isReverse())
2198     Index += (VF - 1) * Group->getFactor();
2199 
2200   bool InBounds = false;
2201   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2202     InBounds = gep->isInBounds();
2203 
2204   for (unsigned Part = 0; Part < UF; Part++) {
2205     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2206 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2218     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2219     if (InBounds)
2220       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2221 
2222     // Cast to the vector pointer type.
2223     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2224   }
2225 
2226   setDebugLocFromInst(Builder, Instr);
2227   Value *UndefVec = UndefValue::get(VecTy);
2228 
2229   Value *MaskForGaps = nullptr;
2230   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2231     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2232     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2233   }
2234 
2235   // Vectorize the interleaved load group.
2236   if (isa<LoadInst>(Instr)) {
2237     // For each unroll part, create a wide load for the group.
2238     SmallVector<Value *, 2> NewLoads;
2239     for (unsigned Part = 0; Part < UF; Part++) {
2240       Instruction *NewLoad;
2241       if (IsMaskForCondRequired || MaskForGaps) {
2242         assert(useMaskedInterleavedAccesses(*TTI) &&
2243                "masked interleaved groups are not allowed.");
2244         Value *GroupMask = MaskForGaps;
2245         if (IsMaskForCondRequired) {
2246           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2247           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2248           Value *ShuffledMask = Builder.CreateShuffleVector(
2249               Mask[Part], Undefs, RepMask, "interleaved.mask");
2250           GroupMask = MaskForGaps
2251                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2252                                                 MaskForGaps)
2253                           : ShuffledMask;
2254         }
2255         NewLoad =
2256             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2257                                      GroupMask, UndefVec, "wide.masked.vec");
2258       }
2259       else
2260         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2261                                             Group->getAlignment(), "wide.vec");
2262       Group->addMetadata(NewLoad);
2263       NewLoads.push_back(NewLoad);
2264     }
2265 
2266     // For each member in the group, shuffle out the appropriate data from the
2267     // wide loads.
2268     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2269       Instruction *Member = Group->getMember(I);
2270 
2271       // Skip the gaps in the group.
2272       if (!Member)
2273         continue;
2274 
2275       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2276       for (unsigned Part = 0; Part < UF; Part++) {
2277         Value *StridedVec = Builder.CreateShuffleVector(
2278             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2279 
        // If this member has a different type, cast the result to that type.
2281         if (Member->getType() != ScalarTy) {
2282           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2283           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2284         }
2285 
2286         if (Group->isReverse())
2287           StridedVec = reverseVector(StridedVec);
2288 
2289         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2290       }
2291     }
2292     return;
2293   }
2294 
  // The subvector type for the current instruction.
2296   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2297 
2298   // Vectorize the interleaved store group.
2299   for (unsigned Part = 0; Part < UF; Part++) {
2300     // Collect the stored vector from each member.
2301     SmallVector<Value *, 4> StoredVecs;
2302     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2304       Instruction *Member = Group->getMember(i);
      assert(Member && "Failed to get a member from an interleaved store group");
2306 
2307       Value *StoredVec = getOrCreateVectorValue(
2308           cast<StoreInst>(Member)->getValueOperand(), Part);
2309       if (Group->isReverse())
2310         StoredVec = reverseVector(StoredVec);
2311 
      // If this member has a different type, cast it to a unified type.
2314       if (StoredVec->getType() != SubVT)
2315         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2316 
2317       StoredVecs.push_back(StoredVec);
2318     }
2319 
2320     // Concatenate all vectors into a wide vector.
2321     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2322 
2323     // Interleave the elements in the wide vector.
2324     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2325     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2326                                               "interleaved.vec");
2327 
2328     Instruction *NewStoreInstr;
2329     if (IsMaskForCondRequired) {
2330       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2331       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2332       Value *ShuffledMask = Builder.CreateShuffleVector(
2333           Mask[Part], Undefs, RepMask, "interleaved.mask");
2334       NewStoreInstr = Builder.CreateMaskedStore(
2335           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2336     }
2337     else
2338       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2339         Group->getAlignment());
2340 
2341     Group->addMetadata(NewStoreInstr);
2342   }
2343 }
2344 
2345 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2346                                                      VectorParts *BlockInMask) {
2347   // Attempt to issue a wide load.
2348   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2349   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2350 
2351   assert((LI || SI) && "Invalid Load/Store instruction");
2352 
2353   LoopVectorizationCostModel::InstWidening Decision =
2354       Cost->getWideningDecision(Instr, VF);
2355   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2356          "CM decision should be taken at this point");
2357   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2358     return vectorizeInterleaveGroup(Instr);
2359 
2360   Type *ScalarDataTy = getMemInstValueType(Instr);
2361   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2362   Value *Ptr = getLoadStorePointerOperand(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2365   const DataLayout &DL = Instr->getModule()->getDataLayout();
2366   const Align Alignment =
2367       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2368   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2369 
2370   // Determine if the pointer operand of the access is either consecutive or
2371   // reverse consecutive.
2372   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2373   bool ConsecutiveStride =
2374       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2375   bool CreateGatherScatter =
2376       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2377 
2378   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2379   // gather/scatter. Otherwise Decision should have been to Scalarize.
2380   assert((ConsecutiveStride || CreateGatherScatter) &&
2381          "The instruction should be scalarized");
2382 
2383   // Handle consecutive loads/stores.
2384   if (ConsecutiveStride)
2385     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2386 
2387   VectorParts Mask;
2388   bool isMaskRequired = BlockInMask;
2389   if (isMaskRequired)
2390     Mask = *BlockInMask;
2391 
2392   bool InBounds = false;
2393   if (auto *gep = dyn_cast<GetElementPtrInst>(
2394           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2395     InBounds = gep->isInBounds();
2396 
2397   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2398     // Calculate the pointer for the specific unroll-part.
2399     GetElementPtrInst *PartPtr = nullptr;
2400 
2401     if (Reverse) {
2402       // If the address is consecutive but reversed, then the
2403       // wide store needs to start at the last vector element.
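      // E.g. (illustrative) with VF = 4 and Part = 0, the two GEPs below
      // offset the pointer by 0 and 1 - 4 = -3 elements, so the wide access
      // starts 3 elements before the scalar pointer and covers offsets
      // [-3, 0]; the loaded/stored vector itself is reversed separately.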
2404       PartPtr = cast<GetElementPtrInst>(
2405           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2406       PartPtr->setIsInBounds(InBounds);
2407       PartPtr = cast<GetElementPtrInst>(
2408           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2409       PartPtr->setIsInBounds(InBounds);
2410       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2411         Mask[Part] = reverseVector(Mask[Part]);
2412     } else {
2413       PartPtr = cast<GetElementPtrInst>(
2414           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2415       PartPtr->setIsInBounds(InBounds);
2416     }
2417 
2418     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2419   };
2420 
2421   // Handle Stores:
2422   if (SI) {
2423     setDebugLocFromInst(Builder, SI);
2424 
2425     for (unsigned Part = 0; Part < UF; ++Part) {
2426       Instruction *NewSI = nullptr;
2427       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2428       if (CreateGatherScatter) {
2429         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2430         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2431         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2432                                             Alignment.value(), MaskPart);
2433       } else {
2434         if (Reverse) {
2435           // If we store to reverse consecutive memory locations, then we need
2436           // to reverse the order of elements in the stored value.
2437           StoredVal = reverseVector(StoredVal);
2438           // We don't want to update the value in the map as it might be used in
2439           // another expression. So don't call resetVectorValue(StoredVal).
2440         }
2441         auto *VecPtr = CreateVecPtr(Part, Ptr);
2442         if (isMaskRequired)
2443           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
2444                                             Alignment.value(), Mask[Part]);
2445         else
2446           NewSI =
2447               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2448       }
2449       addMetadata(NewSI, SI);
2450     }
2451     return;
2452   }
2453 
2454   // Handle loads.
2455   assert(LI && "Must have a load instruction");
2456   setDebugLocFromInst(Builder, LI);
2457   for (unsigned Part = 0; Part < UF; ++Part) {
2458     Value *NewLI;
2459     if (CreateGatherScatter) {
2460       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2461       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2462       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2463                                          nullptr, "wide.masked.gather");
2464       addMetadata(NewLI, LI);
2465     } else {
2466       auto *VecPtr = CreateVecPtr(Part, Ptr);
2467       if (isMaskRequired)
2468         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
2469                                          UndefValue::get(DataTy),
2470                                          "wide.masked.load");
2471       else
2472         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2473                                           "wide.load");
2474 
      // Add metadata to the load, but set the vector value to the reversed
      // shuffle below, if any.
2476       addMetadata(NewLI, LI);
2477       if (Reverse)
2478         NewLI = reverseVector(NewLI);
2479     }
2480     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2481   }
2482 }
2483 
2484 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2485                                                const VPIteration &Instance,
2486                                                bool IfPredicateInstr) {
2487   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2488 
2489   setDebugLocFromInst(Builder, Instr);
2490 
  // Does this instruction return a value?
2492   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2493 
2494   Instruction *Cloned = Instr->clone();
2495   if (!IsVoidRetTy)
2496     Cloned->setName(Instr->getName() + ".cloned");
2497 
  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
2500   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2501     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2502     Cloned->setOperand(op, NewOp);
2503   }
2504   addNewMetadata(Cloned, Instr);
2505 
2506   // Place the cloned scalar in the new loop.
2507   Builder.Insert(Cloned);
2508 
2509   // Add the cloned scalar to the scalar map entry.
2510   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2511 
  // If we just cloned a new assumption, add it to the assumption cache.
2513   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2514     if (II->getIntrinsicID() == Intrinsic::assume)
2515       AC->registerAssumption(II);
2516 
2517   // End if-block.
2518   if (IfPredicateInstr)
2519     PredicatedInstructions.push_back(Cloned);
2520 }
2521 
2522 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2523                                                       Value *End, Value *Step,
2524                                                       Instruction *DL) {
2525   BasicBlock *Header = L->getHeader();
2526   BasicBlock *Latch = L->getLoopLatch();
2527   // As we're just creating this loop, it's possible no latch exists
2528   // yet. If so, use the header as this will be a single block loop.
2529   if (!Latch)
2530     Latch = Header;
2531 
2532   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2533   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2534   setDebugLocFromInst(Builder, OldInst);
2535   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2536 
2537   Builder.SetInsertPoint(Latch->getTerminator());
2538   setDebugLocFromInst(Builder, OldInst);
2539 
2540   // Create i+1 and fill the PHINode.
2541   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2542   Induction->addIncoming(Start, L->getLoopPreheader());
2543   Induction->addIncoming(Next, Latch);
2544   // Create the compare.
2545   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2546   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2547 
2548   // Now we have two terminators. Remove the old one from the block.
2549   Latch->getTerminator()->eraseFromParent();
2550 
2551   return Induction;
2552 }
2553 
2554 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2555   if (TripCount)
2556     return TripCount;
2557 
2558   assert(L && "Create Trip Count for null loop.");
2559   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2560   // Find the loop boundaries.
2561   ScalarEvolution *SE = PSE.getSE();
2562   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2563   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2564          "Invalid loop count");
2565 
2566   Type *IdxTy = Legal->getWidestInductionType();
2567   assert(IdxTy && "No type for induction");
2568 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the
  // compare. The only way we get a backedge-taken count in that case is if
  // the induction variable was signed and therefore will not overflow, so
  // truncation is legal.
2574   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2575       IdxTy->getPrimitiveSizeInBits())
2576     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2577   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2578 
2579   // Get the total trip count from the count by adding 1.
2580   const SCEV *ExitCount = SE->getAddExpr(
2581       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2582 
2583   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2584 
2585   // Expand the trip count and place the new instructions in the preheader.
2586   // Notice that the pre-header does not change, only the loop body.
2587   SCEVExpander Exp(*SE, DL, "induction");
2588 
2589   // Count holds the overall loop count (N).
2590   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2591                                 L->getLoopPreheader()->getTerminator());
2592 
2593   if (TripCount->getType()->isPointerTy())
2594     TripCount =
2595         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2596                                     L->getLoopPreheader()->getTerminator());
2597 
2598   return TripCount;
2599 }
2600 
2601 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2602   if (VectorTripCount)
2603     return VectorTripCount;
2604 
2605   Value *TC = getOrCreateTripCount(L);
2606   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2607 
2608   Type *Ty = TC->getType();
2609   Constant *Step = ConstantInt::get(Ty, VF * UF);
2610 
2611   // If the tail is to be folded by masking, round the number of iterations N
2612   // up to a multiple of Step instead of rounding down. This is done by first
2613   // adding Step-1 and then rounding down. Note that it's ok if this addition
2614   // overflows: the vector induction variable will eventually wrap to zero given
2615   // that it starts at zero and its Step is a power of two; the loop will then
2616   // exit, with the last early-exit vector comparison also producing all-true.
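  // As an illustration: with VF * UF = 4 and an original trip count N = 10,
  // the rounded count is 10 + 3 = 13 and the vector trip count below becomes
  // 13 - (13 % 4) = 12, so the masked vector loop covers all 10 iterations.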
2617   if (Cost->foldTailByMasking()) {
2618     assert(isPowerOf2_32(VF * UF) &&
2619            "VF*UF must be a power of 2 when folding tail by masking");
2620     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2621   }
2622 
2623   // Now we need to generate the expression for the part of the loop that the
2624   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2625   // iterations are not required for correctness, or N - Step, otherwise. Step
2626   // is equal to the vectorization factor (number of SIMD elements) times the
2627   // unroll factor (number of SIMD instructions).
2628   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2629 
2630   // If there is a non-reversed interleaved group that may speculatively access
2631   // memory out-of-bounds, we need to ensure that there will be at least one
2632   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2633   // the trip count, we set the remainder to be equal to the step. If the step
2634   // does not evenly divide the trip count, no adjustment is necessary since
2635   // there will already be scalar iterations. Note that the minimum iterations
2636   // check ensures that N >= Step.
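  // As an illustration: with Step = 4 and N = 12, R would be 0, leaving no
  // scalar iterations; the select below sets R = 4 instead, so n.vec = 8 and
  // the scalar epilogue executes the remaining 4 iterations.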
2637   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2638     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2639     R = Builder.CreateSelect(IsZero, Step, R);
2640   }
2641 
2642   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2643 
2644   return VectorTripCount;
2645 }
2646 
2647 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2648                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as
  // DstVTy.
2650   unsigned VF = DstVTy->getNumElements();
2651   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2652   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2653   Type *SrcElemTy = SrcVecTy->getElementType();
2654   Type *DstElemTy = DstVTy->getElementType();
2655   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2656          "Vector elements must have same size");
2657 
2658   // Do a direct cast if element types are castable.
2659   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2660     return Builder.CreateBitOrPointerCast(V, DstVTy);
2661   }
  // V cannot be directly cast to the desired vector type.
  // This may happen when V is a floating point vector but DstVTy is a vector
  // of pointers or vice-versa. Handle this with a two-step bitcast through an
  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
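  // For example, on a target with 64-bit pointers, casting <4 x double> to
  // <4 x i8*> would go through the intermediate type <4 x i64>.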
2666   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2667          "Only one type should be a pointer type");
2668   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2669          "Only one type should be a floating point type");
2670   Type *IntTy =
2671       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2672   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2673   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2674   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2675 }
2676 
2677 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2678                                                          BasicBlock *Bypass) {
2679   Value *Count = getOrCreateTripCount(L);
2680   BasicBlock *BB = L->getLoopPreheader();
2681   IRBuilder<> Builder(BB->getTerminator());
2682 
2683   // Generate code to check if the loop's trip count is less than VF * UF, or
2684   // equal to it in case a scalar epilogue is required; this implies that the
2685   // vector trip count is zero. This check also covers the case where adding one
2686   // to the backedge-taken count overflowed leading to an incorrect trip count
2687   // of zero. In this case we will also jump to the scalar loop.
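  // In shorthand IR, when no scalar epilogue is required (illustrative):
  //   %min.iters.check = icmp ult i64 %trip.count, VF * UF
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph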
2688   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2689                                           : ICmpInst::ICMP_ULT;
2690 
  // If the tail is to be folded, the vector loop takes care of all iterations.
2692   Value *CheckMinIters = Builder.getFalse();
2693   if (!Cost->foldTailByMasking())
2694     CheckMinIters = Builder.CreateICmp(
2695         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2696         "min.iters.check");
2697 
2698   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2699   // Update dominator tree immediately if the generated block is a
2700   // LoopBypassBlock because SCEV expansions to generate loop bypass
2701   // checks may query it before the current function is finished.
2702   DT->addNewBlock(NewBB, BB);
2703   if (L->getParentLoop())
2704     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2705   ReplaceInstWithInst(BB->getTerminator(),
2706                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2707   LoopBypassBlocks.push_back(BB);
2708 }
2709 
2710 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2711   BasicBlock *BB = L->getLoopPreheader();
2712 
  // Generate the code to check the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
2716   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2717                    "scev.check");
2718   Value *SCEVCheck =
2719       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2720 
2721   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2722     if (C->isZero())
2723       return;
2724 
2725   assert(!BB->getParent()->hasOptSize() &&
2726          "Cannot SCEV check stride or overflow when optimizing for size");
2727 
2728   // Create a new block containing the stride check.
2729   BB->setName("vector.scevcheck");
2730   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2731   // Update dominator tree immediately if the generated block is a
2732   // LoopBypassBlock because SCEV expansions to generate loop bypass
2733   // checks may query it before the current function is finished.
2734   DT->addNewBlock(NewBB, BB);
2735   if (L->getParentLoop())
2736     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2737   ReplaceInstWithInst(BB->getTerminator(),
2738                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2739   LoopBypassBlocks.push_back(BB);
2740   AddedSafetyChecks = true;
2741 }
2742 
2743 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2744   // VPlan-native path does not do any analysis for runtime checks currently.
2745   if (EnableVPlanNativePath)
2746     return;
2747 
2748   BasicBlock *BB = L->getLoopPreheader();
2749 
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
2753   Instruction *FirstCheckInst;
2754   Instruction *MemRuntimeCheck;
2755   std::tie(FirstCheckInst, MemRuntimeCheck) =
2756       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2757   if (!MemRuntimeCheck)
2758     return;
2759 
2760   if (BB->getParent()->hasOptSize()) {
2761     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2762            "Cannot emit memory checks when optimizing for size, unless forced "
2763            "to vectorize.");
2764     ORE->emit([&]() {
2765       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2766                                         L->getStartLoc(), L->getHeader())
2767              << "Code-size may be reduced by not forcing "
2768                 "vectorization, or by source-code modifications "
2769                 "eliminating the need for runtime checks "
2770                 "(e.g., adding 'restrict').";
2771     });
2772   }
2773 
2774   // Create a new block containing the memory check.
2775   BB->setName("vector.memcheck");
2776   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2777   // Update dominator tree immediately if the generated block is a
2778   // LoopBypassBlock because SCEV expansions to generate loop bypass
2779   // checks may query it before the current function is finished.
2780   DT->addNewBlock(NewBB, BB);
2781   if (L->getParentLoop())
2782     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2783   ReplaceInstWithInst(BB->getTerminator(),
2784                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2785   LoopBypassBlocks.push_back(BB);
2786   AddedSafetyChecks = true;
2787 
2788   // We currently don't use LoopVersioning for the actual loop cloning but we
2789   // still use it to add the noalias metadata.
2790   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2791                                            PSE.getSE());
2792   LVer->prepareNoAliasMetadata();
2793 }
2794 
2795 Value *InnerLoopVectorizer::emitTransformedIndex(
2796     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2797     const InductionDescriptor &ID) const {
2798 
2799   SCEVExpander Exp(*SE, DL, "induction");
2800   auto Step = ID.getStep();
2801   auto StartValue = ID.getStartValue();
2802   assert(Index->getType() == Step->getType() &&
2803          "Index type does not match StepValue type");
2804 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
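  // As an illustration, an IK_IntInduction with StartValue = 7 and Step = 3
  // maps Index = 4 to 7 + 4 * 3 = 19. The helpers below fold the trivial
  // cases where an operand is the constant 0 (add) or 1 (mul).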
2811   auto CreateAdd = [&B](Value *X, Value *Y) {
2812     assert(X->getType() == Y->getType() && "Types don't match!");
2813     if (auto *CX = dyn_cast<ConstantInt>(X))
2814       if (CX->isZero())
2815         return Y;
2816     if (auto *CY = dyn_cast<ConstantInt>(Y))
2817       if (CY->isZero())
2818         return X;
2819     return B.CreateAdd(X, Y);
2820   };
2821 
2822   auto CreateMul = [&B](Value *X, Value *Y) {
2823     assert(X->getType() == Y->getType() && "Types don't match!");
2824     if (auto *CX = dyn_cast<ConstantInt>(X))
2825       if (CX->isOne())
2826         return Y;
2827     if (auto *CY = dyn_cast<ConstantInt>(Y))
2828       if (CY->isOne())
2829         return X;
2830     return B.CreateMul(X, Y);
2831   };
2832 
2833   switch (ID.getKind()) {
2834   case InductionDescriptor::IK_IntInduction: {
2835     assert(Index->getType() == StartValue->getType() &&
2836            "Index type does not match StartValue type");
2837     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2838       return B.CreateSub(StartValue, Index);
2839     auto *Offset = CreateMul(
2840         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2841     return CreateAdd(StartValue, Offset);
2842   }
2843   case InductionDescriptor::IK_PtrInduction: {
2844     assert(isa<SCEVConstant>(Step) &&
2845            "Expected constant step for pointer induction");
2846     return B.CreateGEP(
2847         StartValue->getType()->getPointerElementType(), StartValue,
2848         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2849                                            &*B.GetInsertPoint())));
2850   }
2851   case InductionDescriptor::IK_FpInduction: {
2852     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2853     auto InductionBinOp = ID.getInductionBinOp();
2854     assert(InductionBinOp &&
2855            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2856             InductionBinOp->getOpcode() == Instruction::FSub) &&
2857            "Original bin op should be defined for FP induction");
2858 
2859     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2860 
2861     // Floating point operations had to be 'fast' to enable the induction.
2862     FastMathFlags Flags;
2863     Flags.setFast();
2864 
2865     Value *MulExp = B.CreateFMul(StepValue, Index);
2866     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may have been folded to a constant.
2868       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2869 
2870     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2871                                "induction");
2872     if (isa<Instruction>(BOp))
2873       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2874 
2875     return BOp;
2876   }
2877   case InductionDescriptor::IK_NoInduction:
2878     return nullptr;
2879   }
2880   llvm_unreachable("invalid enum");
2881 }
2882 
2883 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2884   /*
2885    In this function we generate a new loop. The new loop will contain
2886    the vectorized instructions while the old loop will continue to run the
2887    scalar remainder.
2888 
2889        [ ] <-- loop iteration number check.
2890     /   |
2891    /    v
2892   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2893   |  /  |
2894   | /   v
2895   ||   [ ]     <-- vector pre header.
2896   |/    |
2897   |     v
2898   |    [  ] \
2899   |    [  ]_|   <-- vector loop.
2900   |     |
2901   |     v
2902   |   -[ ]   <--- middle-block.
2903   |  /  |
2904   | /   v
2905   -|- >[ ]     <--- new preheader.
2906    |    |
2907    |    v
2908    |   [ ] \
2909    |   [ ]_|   <-- old scalar loop to handle remainder.
2910     \   |
2911      \  v
2912       >[ ]     <-- exit block.
2913    ...
2914    */
2915 
2916   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2917   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2918   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2919   MDNode *OrigLoopID = OrigLoop->getLoopID();
2920   assert(VectorPH && "Invalid loop structure");
2921   assert(ExitBlock && "Must have an exit block");
2922 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often use multiple pointer
  // induction variables. The code below also supports the case where we
  // don't have a single induction variable.
  //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
2930   //   - is an integer
2931   //   - counts from zero, stepping by one
2932   //   - is the size of the widest induction variable type
2933   // then we create a new one.
2934   OldInduction = Legal->getPrimaryInduction();
2935   Type *IdxTy = Legal->getWidestInductionType();
2936 
2937   // Split the single block loop into the two loop structure described above.
2938   BasicBlock *VecBody =
2939       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2940   BasicBlock *MiddleBlock =
2941       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2942   BasicBlock *ScalarPH =
2943       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2944 
2945   // Create and register the new vector loop.
2946   Loop *Lp = LI->AllocateLoop();
2947   Loop *ParentLoop = OrigLoop->getParentLoop();
2948 
2949   // Insert the new loop into the loop nest and register the new basic blocks
2950   // before calling any utilities such as SCEV that require valid LoopInfo.
2951   if (ParentLoop) {
2952     ParentLoop->addChildLoop(Lp);
2953     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2954     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2955   } else {
2956     LI->addTopLevelLoop(Lp);
2957   }
2958   Lp->addBasicBlockToLoop(VecBody, *LI);
2959 
2960   // Find the loop boundaries.
2961   Value *Count = getOrCreateTripCount(Lp);
2962 
2963   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2964 
2965   // Now, compare the new count to zero. If it is zero skip the vector loop and
2966   // jump to the scalar loop. This check also covers the case where the
2967   // backedge-taken count is uint##_max: adding one to it will overflow leading
2968   // to an incorrect trip count of zero. In this (rare) case we will also jump
2969   // to the scalar loop.
2970   emitMinimumIterationCountCheck(Lp, ScalarPH);
2971 
2972   // Generate the code to check any assumptions that we've made for SCEV
2973   // expressions.
2974   emitSCEVChecks(Lp, ScalarPH);
2975 
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
2979   emitMemRuntimeChecks(Lp, ScalarPH);
2980 
2981   // Generate the induction variable.
2982   // The loop step is equal to the vectorization factor (num of SIMD elements)
2983   // times the unroll factor (num of SIMD instructions).
2984   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2985   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2986   Induction =
2987       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2988                               getDebugLocFromInstOrOperands(OldInduction));
2989 
2990   // We are going to resume the execution of the scalar loop.
2991   // Go over all of the induction variables that we found and fix the
2992   // PHIs that are left in the scalar version of the loop.
2993   // The starting values of PHI nodes depend on the counter of the last
2994   // iteration in the vectorized loop.
2995   // If we come from a bypass edge then we need to start from the original
2996   // start value.
2997 
2998   // This variable saves the new starting index for the scalar loop. It is used
2999   // to test if there are any tail iterations left once the vector loop has
3000   // completed.
3001   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3002   for (auto &InductionEntry : *List) {
3003     PHINode *OrigPhi = InductionEntry.first;
3004     InductionDescriptor II = InductionEntry.second;
3005 
    // Create phi nodes to merge from the backedge-taken check block.
3007     PHINode *BCResumeVal = PHINode::Create(
3008         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3009     // Copy original phi DL over to the new one.
3010     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3011     Value *&EndValue = IVEndValues[OrigPhi];
3012     if (OrigPhi == OldInduction) {
3013       // We know what the end value is.
3014       EndValue = CountRoundDown;
3015     } else {
3016       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3017       Type *StepType = II.getStep()->getType();
3018       Instruction::CastOps CastOp =
3019         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3020       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3021       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3022       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3023       EndValue->setName("ind.end");
3024     }
3025 
3026     // The new PHI merges the original incoming value, in case of a bypass,
3027     // or the value at the end of the vectorized loop.
3028     BCResumeVal->addIncoming(EndValue, MiddleBlock);
3029 
3030     // Fix the scalar body counter (PHI node).
3031     // The old induction's phi node in the scalar body needs the truncated
3032     // value.
3033     for (BasicBlock *BB : LoopBypassBlocks)
3034       BCResumeVal->addIncoming(II.getStartValue(), BB);
3035     OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
3036   }
3037 
3038   // We need the OrigLoop (scalar loop part) latch terminator to help
3039   // produce correct debug info for the middle block BB instructions.
3040   // The legality check stage guarantees that the loop will have a single
3041   // latch.
3042   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3043          "Scalar loop latch terminator isn't a branch");
3044   BranchInst *ScalarLatchBr =
3045       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3046 
3047   // Add a check in the middle block to see if we have completed
3048   // all of the iterations in the first vector loop.
3049   // If (N - N%VF) == N, then we *don't* need to run the remainder.
  // If the tail is to be folded, we know we don't need to run the remainder.
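  // As an illustration (no scalar epilogue required): with N = 10 and
  // VF * UF = 4, CountRoundDown is 8, so CmpN is false and the scalar loop
  // handles the remaining 2 iterations.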
3051   Value *CmpN = Builder.getTrue();
3052   if (!Cost->foldTailByMasking()) {
3053     CmpN =
3054         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3055                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3056 
    // Here we use the same DebugLoc as the scalar loop latch branch instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare got a line number inside the loop.
3061     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3062   }
3063 
3064   BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3065   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3066   ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3067 
3068   // Get ready to start creating new instructions into the vectorized body.
3069   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3070 
3071   // Save the state.
3072   LoopVectorPreHeader = Lp->getLoopPreheader();
3073   LoopScalarPreHeader = ScalarPH;
3074   LoopMiddleBlock = MiddleBlock;
3075   LoopExitBlock = ExitBlock;
3076   LoopVectorBody = VecBody;
3077   LoopScalarBody = OldBasicBlock;
3078 
3079   Optional<MDNode *> VectorizedLoopID =
3080       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3081                                       LLVMLoopVectorizeFollowupVectorized});
3082   if (VectorizedLoopID.hasValue()) {
3083     Lp->setLoopID(VectorizedLoopID.getValue());
3084 
3085     // Do not setAlreadyVectorized if loop attributes have been defined
3086     // explicitly.
3087     return LoopVectorPreHeader;
3088   }
3089 
3090   // Keep all loop hints from the original loop on the vector loop (we'll
3091   // replace the vectorizer-specific hints below).
3092   if (MDNode *LID = OrigLoop->getLoopID())
3093     Lp->setLoopID(LID);
3094 
3095   LoopVectorizeHints Hints(Lp, true, *ORE);
3096   Hints.setAlreadyVectorized();
3097 
3098   return LoopVectorPreHeader;
3099 }
3100 
3101 // Fix up external users of the induction variable. At this point, we are
3102 // in LCSSA form, with all external PHIs that use the IV having one input value,
3103 // coming from the remainder loop. We need those PHIs to also have a correct
3104 // value for the IV when arriving directly from the middle block.
3105 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3106                                        const InductionDescriptor &II,
3107                                        Value *CountRoundDown, Value *EndValue,
3108                                        BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the
  // penultimate value (the value that feeds into the phi from the loop
  // latch). We allow both, but they obviously have different values.
3113 
3114   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3115 
3116   DenseMap<Value *, Value *> MissingVals;
3117 
3118   // An external user of the last iteration's value should see the value that
3119   // the remainder loop uses to initialize its own IV.
3120   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3121   for (User *U : PostInc->users()) {
3122     Instruction *UI = cast<Instruction>(U);
3123     if (!OrigLoop->contains(UI)) {
3124       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3125       MissingVals[UI] = EndValue;
3126     }
3127   }
3128 
  // An external user of the penultimate value needs to see EndValue - Step.
3130   // The simplest way to get this is to recompute it from the constituent SCEVs,
3131   // that is Start + (Step * (CRD - 1)).
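  // As an illustration: with Start = 0, Step = 2 and CRD = 8, the escaping
  // penultimate value is 0 + 2 * (8 - 1) = 14.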
3132   for (User *U : OrigPhi->users()) {
3133     auto *UI = cast<Instruction>(U);
3134     if (!OrigLoop->contains(UI)) {
3135       const DataLayout &DL =
3136           OrigLoop->getHeader()->getModule()->getDataLayout();
3137       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3138 
3139       IRBuilder<> B(MiddleBlock->getTerminator());
3140       Value *CountMinusOne = B.CreateSub(
3141           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3142       Value *CMO =
3143           !II.getStep()->getType()->isIntegerTy()
3144               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3145                              II.getStep()->getType())
3146               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3147       CMO->setName("cast.cmo");
3148       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3149       Escape->setName("ind.escape");
3150       MissingVals[UI] = Escape;
3151     }
3152   }
3153 
3154   for (auto &I : MissingVals) {
3155     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
3158     // In this case, if IV1 has an external use, we need to avoid adding both
3159     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3160     // don't already have an incoming value for the middle block.
3161     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3162       PHI->addIncoming(I.second, MiddleBlock);
3163   }
3164 }
3165 
3166 namespace {
3167 
3168 struct CSEDenseMapInfo {
3169   static bool canHandle(const Instruction *I) {
3170     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3171            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3172   }
3173 
3174   static inline Instruction *getEmptyKey() {
3175     return DenseMapInfo<Instruction *>::getEmptyKey();
3176   }
3177 
3178   static inline Instruction *getTombstoneKey() {
3179     return DenseMapInfo<Instruction *>::getTombstoneKey();
3180   }
3181 
3182   static unsigned getHashValue(const Instruction *I) {
3183     assert(canHandle(I) && "Unknown instruction!");
3184     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3185                                                            I->value_op_end()));
3186   }
3187 
3188   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3189     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3190         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3191       return LHS == RHS;
3192     return LHS->isIdenticalTo(RHS);
3193   }
3194 };
3195 
3196 } // end anonymous namespace
3197 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3201   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3202   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3203     Instruction *In = &*I++;
3204 
3205     if (!CSEDenseMapInfo::canHandle(In))
3206       continue;
3207 
3208     // Check if we can replace this instruction with any of the
3209     // visited instructions.
3210     if (Instruction *V = CSEMap.lookup(In)) {
3211       In->replaceAllUsesWith(V);
3212       In->eraseFromParent();
3213       continue;
3214     }
3215 
3216     CSEMap[In] = In;
3217   }
3218 }
3219 
3220 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3221                                                        unsigned VF,
3222                                                        bool &NeedToScalarize) {
3223   Function *F = CI->getCalledFunction();
3224   StringRef FnName = CI->getCalledFunction()->getName();
3225   Type *ScalarRetTy = CI->getType();
3226   SmallVector<Type *, 4> Tys, ScalarTys;
3227   for (auto &ArgOp : CI->arg_operands())
3228     ScalarTys.push_back(ArgOp->getType());
3229 
3230   // Estimate cost of scalarized vector call. The source operands are assumed
3231   // to be vectors, so we need to extract individual elements from there,
3232   // execute VF scalar calls, and then gather the result into the vector return
3233   // value.
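  // As an illustration (arbitrary costs): with VF = 4, a scalar call cost of
  // 10 and a scalarization overhead of 6, the estimate computed below is
  // 4 * 10 + 6 = 46.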
3234   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3235   if (VF == 1)
3236     return ScalarCallCost;
3237 
3238   // Compute corresponding vector type for return value and arguments.
3239   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3240   for (Type *ScalarTy : ScalarTys)
3241     Tys.push_back(ToVectorTy(ScalarTy, VF));
3242 
3243   // Compute costs of unpacking argument values for the scalar calls and
3244   // packing the return values to a vector.
3245   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3246 
3247   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3248 
3249   // If we can't emit a vector call for this function, then the currently found
3250   // cost is the cost we need to return.
3251   NeedToScalarize = true;
3252   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3253     return Cost;
3254 
3255   // If the corresponding vector cost is cheaper, return its cost.
3256   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3257   if (VectorCallCost < Cost) {
3258     NeedToScalarize = false;
3259     return VectorCallCost;
3260   }
3261   return Cost;
3262 }
3263 
3264 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3265                                                             unsigned VF) {
3266   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3267   assert(ID && "Expected intrinsic call!");
3268 
3269   FastMathFlags FMF;
3270   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3271     FMF = FPMO->getFastMathFlags();
3272 
3273   SmallVector<Value *, 4> Operands(CI->arg_operands());
3274   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3275 }
3276 
3277 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3278   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3279   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3280   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3281 }
3282 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3283   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3284   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3285   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3286 }
3287 
3288 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3289   // For every instruction `I` in MinBWs, truncate the operands, create a
3290   // truncated version of `I` and reextend its result. InstCombine runs
3291   // later and will remove any ext/trunc pairs.
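  // As an illustration, if `I` is an i32 add whose minimal bitwidth is 8 and
  // VF = 4, its operands are truncated to <4 x i8>, the add is recreated on
  // <4 x i8>, and the result is zero-extended back to <4 x i32>.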
3292   SmallPtrSet<Value *, 4> Erased;
3293   for (const auto &KV : Cost->getMinimalBitwidths()) {
3294     // If the value wasn't vectorized, we must maintain the original scalar
3295     // type. The absence of the value from VectorLoopValueMap indicates that it
3296     // wasn't vectorized.
3297     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3298       continue;
3299     for (unsigned Part = 0; Part < UF; ++Part) {
3300       Value *I = getOrCreateVectorValue(KV.first, Part);
3301       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3302           !isa<Instruction>(I))
3303         continue;
3304       Type *OriginalTy = I->getType();
3305       Type *ScalarTruncatedTy =
3306           IntegerType::get(OriginalTy->getContext(), KV.second);
3307       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3308                                           OriginalTy->getVectorNumElements());
3309       if (TruncatedTy == OriginalTy)
3310         continue;
3311 
3312       IRBuilder<> B(cast<Instruction>(I));
3313       auto ShrinkOperand = [&](Value *V) -> Value * {
3314         if (auto *ZI = dyn_cast<ZExtInst>(V))
3315           if (ZI->getSrcTy() == TruncatedTy)
3316             return ZI->getOperand(0);
3317         return B.CreateZExtOrTrunc(V, TruncatedTy);
3318       };
3319 
3320       // The actual instruction modification depends on the instruction type,
3321       // unfortunately.
3322       Value *NewI = nullptr;
3323       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3324         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3325                              ShrinkOperand(BO->getOperand(1)));
3326 
3327         // Any wrapping introduced by shrinking this operation shouldn't be
3328         // considered undefined behavior. So, we can't unconditionally copy
3329         // arithmetic wrapping flags to NewI.
3330         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3331       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3332         NewI =
3333             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3334                          ShrinkOperand(CI->getOperand(1)));
3335       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3336         NewI = B.CreateSelect(SI->getCondition(),
3337                               ShrinkOperand(SI->getTrueValue()),
3338                               ShrinkOperand(SI->getFalseValue()));
3339       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3340         switch (CI->getOpcode()) {
3341         default:
3342           llvm_unreachable("Unhandled cast!");
3343         case Instruction::Trunc:
3344           NewI = ShrinkOperand(CI->getOperand(0));
3345           break;
3346         case Instruction::SExt:
3347           NewI = B.CreateSExtOrTrunc(
3348               CI->getOperand(0),
3349               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3350           break;
3351         case Instruction::ZExt:
3352           NewI = B.CreateZExtOrTrunc(
3353               CI->getOperand(0),
3354               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3355           break;
3356         }
3357       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3358         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3359         auto *O0 = B.CreateZExtOrTrunc(
3360             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3361         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3362         auto *O1 = B.CreateZExtOrTrunc(
3363             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3364 
3365         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3366       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3367         // Don't do anything with the operands, just extend the result.
3368         continue;
3369       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3370         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3371         auto *O0 = B.CreateZExtOrTrunc(
3372             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3373         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3374         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3375       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3376         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3377         auto *O0 = B.CreateZExtOrTrunc(
3378             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3379         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3380       } else {
3381         // If we don't know what to do, be conservative and don't do anything.
3382         continue;
3383       }
3384 
3385       // Lastly, extend the result.
3386       NewI->takeName(cast<Instruction>(I));
3387       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3388       I->replaceAllUsesWith(Res);
3389       cast<Instruction>(I)->eraseFromParent();
3390       Erased.insert(I);
3391       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3392     }
3393   }
3394 
3395   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3396   for (const auto &KV : Cost->getMinimalBitwidths()) {
3397     // If the value wasn't vectorized, we must maintain the original scalar
3398     // type. The absence of the value from VectorLoopValueMap indicates that it
3399     // wasn't vectorized.
3400     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3401       continue;
3402     for (unsigned Part = 0; Part < UF; ++Part) {
3403       Value *I = getOrCreateVectorValue(KV.first, Part);
3404       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3405       if (Inst && Inst->use_empty()) {
3406         Value *NewI = Inst->getOperand(0);
3407         Inst->eraseFromParent();
3408         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3409       }
3410     }
3411   }
3412 }
3413 
3414 void InnerLoopVectorizer::fixVectorizedLoop() {
3415   // Insert truncates and extends for any truncated instructions as hints to
3416   // InstCombine.
3417   if (VF > 1)
3418     truncateToMinimalBitwidths();
3419 
3420   // Fix widened non-induction PHIs by setting up the PHI operands.
3421   if (OrigPHIsToFix.size()) {
3422     assert(EnableVPlanNativePath &&
3423            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3424     fixNonInductionPHIs();
3425   }
3426 
3427   // At this point every instruction in the original loop is widened to a
3428   // vector form. Now we need to fix the recurrences in the loop. These PHI
3429   // nodes are currently empty because we did not want to introduce cycles.
3430   // This is the second stage of vectorizing recurrences.
3431   fixCrossIterationPHIs();
3432 
3433   // Update the dominator tree.
3434   //
3435   // FIXME: After creating the structure of the new loop, the dominator tree is
3436   //        no longer up-to-date, and it remains that way until we update it
3437   //        here. An out-of-date dominator tree is problematic for SCEV,
3438   //        because SCEVExpander uses it to guide code generation. The
  //        vectorizer uses SCEVExpanders in several places. Instead, we should
3440   //        keep the dominator tree up-to-date as we go.
3441   updateAnalysis();
3442 
3443   // Fix-up external users of the induction variables.
3444   for (auto &Entry : *Legal->getInductionVars())
3445     fixupIVUsers(Entry.first, Entry.second,
3446                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3447                  IVEndValues[Entry.first], LoopMiddleBlock);
3448 
3449   fixLCSSAPHIs();
3450   for (Instruction *PI : PredicatedInstructions)
3451     sinkScalarOperands(&*PI);
3452 
3453   // Remove redundant induction instructions.
3454   cse(LoopVectorBody);
3455 }
3456 
3457 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3458   // In order to support recurrences we need to be able to vectorize Phi nodes.
3459   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3460   // stage #2: We now need to fix the recurrences by adding incoming edges to
3461   // the currently empty PHI nodes. At this point every instruction in the
3462   // original loop is widened to a vector form so we can use them to construct
3463   // the incoming edges.
3464   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3465     // Handle first-order recurrences and reductions that need to be fixed.
3466     if (Legal->isFirstOrderRecurrence(&Phi))
3467       fixFirstOrderRecurrence(&Phi);
3468     else if (Legal->isReductionVariable(&Phi))
3469       fixReduction(&Phi);
3470   }
3471 }
3472 
3473 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3474   // This is the second phase of vectorizing first-order recurrences. An
3475   // overview of the transformation is described below. Suppose we have the
3476   // following loop.
3477   //
3478   //   for (int i = 0; i < n; ++i)
3479   //     b[i] = a[i] - a[i - 1];
3480   //
3481   // There is a first-order recurrence on "a". For this loop, the shorthand
3482   // scalar IR looks like:
3483   //
3484   //   scalar.ph:
3485   //     s_init = a[-1]
3486   //     br scalar.body
3487   //
3488   //   scalar.body:
3489   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3490   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3491   //     s2 = a[i]
3492   //     b[i] = s2 - s1
3493   //     br cond, scalar.body, ...
3494   //
  // In this example, s1 is a recurrence because its value depends on the
3496   // previous iteration. In the first phase of vectorization, we created a
3497   // temporary value for s1. We now complete the vectorization and produce the
3498   // shorthand vector IR shown below (for VF = 4, UF = 1).
3499   //
3500   //   vector.ph:
3501   //     v_init = vector(..., ..., ..., a[-1])
3502   //     br vector.body
3503   //
3504   //   vector.body
3505   //     i = phi [0, vector.ph], [i+4, vector.body]
3506   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3507   //     v2 = a[i, i+1, i+2, i+3];
3508   //     v3 = vector(v1(3), v2(0, 1, 2))
3509   //     b[i, i+1, i+2, i+3] = v2 - v3
3510   //     br cond, vector.body, middle.block
3511   //
3512   //   middle.block:
3513   //     x = v2(3)
3514   //     br scalar.ph
3515   //
3516   //   scalar.ph:
3517   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3518   //     br scalar.body
3519   //
  // After the vector loop finishes executing, we extract the next value of
3521   // the recurrence (x) to use as the initial value in the scalar loop.
3522 
3523   // Get the original loop preheader and single loop latch.
3524   auto *Preheader = OrigLoop->getLoopPreheader();
3525   auto *Latch = OrigLoop->getLoopLatch();
3526 
3527   // Get the initial and previous values of the scalar recurrence.
3528   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3529   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3530 
3531   // Create a vector from the initial value.
3532   auto *VectorInit = ScalarInit;
3533   if (VF > 1) {
3534     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3535     VectorInit = Builder.CreateInsertElement(
3536         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3537         Builder.getInt32(VF - 1), "vector.recur.init");
3538   }
3539 
3540   // We constructed a temporary phi node in the first phase of vectorization.
3541   // This phi node will eventually be deleted.
3542   Builder.SetInsertPoint(
3543       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3544 
3545   // Create a phi node for the new recurrence. The current value will either be
3546   // the initial value inserted into a vector or loop-varying vector value.
3547   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3548   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3549 
3550   // Get the vectorized previous value of the last part UF - 1. It appears last
3551   // among all unrolled iterations, due to the order of their construction.
3552   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3553 
3554   // Set the insertion point after the previous value if it is an instruction.
3555   // Note that the previous value may have been constant-folded so it is not
3556   // guaranteed to be an instruction in the vector loop. Also, if the previous
3557   // value is a phi node, we should insert after all the phi nodes to avoid
3558   // breaking basic block verification.
3559   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3560       isa<PHINode>(PreviousLastPart))
3561     Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3562   else
3563     Builder.SetInsertPoint(
3564         &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3565 
3566   // We will construct a vector for the recurrence by combining the values for
3567   // the current and previous iterations. This is the required shuffle mask.
3568   SmallVector<Constant *, 8> ShuffleMask(VF);
3569   ShuffleMask[0] = Builder.getInt32(VF - 1);
3570   for (unsigned I = 1; I < VF; ++I)
3571     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
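  // As an illustration, for VF = 4 the mask is <3, 4, 5, 6>: lane 3 of the
  // incoming (previous-iteration) vector followed by lanes 0..2 of the
  // current vector.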
3572 
3573   // The vector from which to take the initial value for the current iteration
3574   // (actual or unrolled). Initially, this is the vector phi node.
3575   Value *Incoming = VecPhi;
3576 
3577   // Shuffle the current and previous vector and update the vector parts.
3578   for (unsigned Part = 0; Part < UF; ++Part) {
3579     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3580     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3581     auto *Shuffle =
3582         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3583                                              ConstantVector::get(ShuffleMask))
3584                : Incoming;
3585     PhiPart->replaceAllUsesWith(Shuffle);
3586     cast<Instruction>(PhiPart)->eraseFromParent();
3587     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3588     Incoming = PreviousPart;
3589   }
3590 
3591   // Fix the latch value of the new recurrence in the vector loop.
3592   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3593 
3594   // Extract the last vector element in the middle block. This will be the
3595   // initial value for the recurrence when jumping to the scalar loop.
3596   auto *ExtractForScalar = Incoming;
3597   if (VF > 1) {
3598     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3599     ExtractForScalar = Builder.CreateExtractElement(
3600         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3601   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
3607   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3608   if (VF > 1)
3609     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3610         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
3615   else if (UF > 1)
3616     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3617 
3618   // Fix the initial value of the original recurrence in the scalar loop.
3619   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3620   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3621   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3622     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3623     Start->addIncoming(Incoming, BB);
3624   }
3625 
3626   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3627   Phi->setName("scalar.recur");
3628 
3629   // Finally, fix users of the recurrence outside the loop. The users will need
3630   // either the last value of the scalar recurrence or the last value of the
3631   // vector recurrence we extracted in the middle block. Since the loop is in
3632   // LCSSA form, we just need to find all the phi nodes for the original scalar
3633   // recurrence in the exit block, and then add an edge for the middle block.
3634   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3635     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3636       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3637     }
3638   }
3639 }
3640 
3641 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3642   Constant *Zero = Builder.getInt32(0);
3643 
  // Get its reduction variable descriptor.
3645   assert(Legal->isReductionVariable(Phi) &&
3646          "Unable to find the reduction variable");
3647   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3648 
3649   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3650   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3651   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3652   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3653     RdxDesc.getMinMaxRecurrenceKind();
3654   setDebugLocFromInst(Builder, ReductionStartValue);
3655 
3656   // We need to generate a reduction vector from the incoming scalar.
3657   // To do so, we need to generate the 'identity' vector and override
3658   // one of the elements with the incoming scalar reduction. We need
3659   // to do it in the vector-loop preheader.
3660   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3661 
3662   // This is the vector-clone of the value that leaves the loop.
3663   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3664 
  // Find the reduction identity variable. Zero for addition, or and xor;
  // one for multiplication; -1 for and.
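  // For example, for an integer add reduction with VF = 4, Identity is the
  // splat <0, 0, 0, 0> and VectorStart is <Start, 0, 0, 0>, i.e. the incoming
  // scalar start value inserted into lane zero of the identity vector.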
3667   Value *Identity;
3668   Value *VectorStart;
3669   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3670       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3672     if (VF == 1) {
3673       VectorStart = Identity = ReductionStartValue;
3674     } else {
3675       VectorStart = Identity =
3676         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3677     }
3678   } else {
3679     // Handle other reduction kinds:
3680     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3681         RK, VecTy->getScalarType());
3682     if (VF == 1) {
3683       Identity = Iden;
3684       // This vector is the Identity vector where the first element is the
3685       // incoming scalar reduction.
3686       VectorStart = ReductionStartValue;
3687     } else {
3688       Identity = ConstantVector::getSplat(VF, Iden);
3689 
3690       // This vector is the Identity vector where the first element is the
3691       // incoming scalar reduction.
3692       VectorStart =
3693         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3694     }
3695   }
3696 
3697   // Fix the vector-loop phi.
3698 
3699   // Reductions do not have to start at zero. They can start with
3700   // any loop invariant values.
3701   BasicBlock *Latch = OrigLoop->getLoopLatch();
3702   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3703   for (unsigned Part = 0; Part < UF; ++Part) {
3704     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3705     Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
3708     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3709     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3710     cast<PHINode>(VecRdxPhi)
3711       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3712   }
3713 
3714   // Before each round, move the insertion point right between
3715   // the PHIs and the values we are going to write.
3716   // This allows us to write both PHINodes and the extractelement
3717   // instructions.
3718   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3719 
3720   setDebugLocFromInst(Builder, LoopExitInst);
3721 
  // If the tail is folded by masking, the vector value that leaves the loop
  // should be a Select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, rather than the vectorized LoopExitInst alone.
3725   if (Cost->foldTailByMasking()) {
3726     for (unsigned Part = 0; Part < UF; ++Part) {
3727       Value *VecLoopExitInst =
3728           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3729       Value *Sel = nullptr;
3730       for (User *U : VecLoopExitInst->users()) {
3731         if (isa<SelectInst>(U)) {
3732           assert(!Sel && "Reduction exit feeding two selects");
3733           Sel = U;
3734         } else
          assert(isa<PHINode>(U) && "Reduction exit must feed a phi or select");
3736       }
3737       assert(Sel && "Reduction exit feeds no select");
3738       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3739     }
3740   }
3741 
3742   // If the vector reduction can be performed in a smaller type, we truncate
3743   // then extend the loop exit value to enable InstCombine to evaluate the
3744   // entire expression in the smaller type.
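  // For example, if an i32 add reduction is known to need only i8, each
  // unrolled loop-exit part is truncated to <VF x i8> and extended back to
  // <VF x i32> in the loop latch; the parts are truncated again in the middle
  // block so the final reduction operates on the narrow type.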
3745   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3746     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3747     Builder.SetInsertPoint(
3748         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3749     VectorParts RdxParts(UF);
3750     for (unsigned Part = 0; Part < UF; ++Part) {
3751       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3752       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3753       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3754                                         : Builder.CreateZExt(Trunc, VecTy);
3755       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3756            UI != RdxParts[Part]->user_end();)
3757         if (*UI != Trunc) {
3758           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3759           RdxParts[Part] = Extnd;
3760         } else {
3761           ++UI;
3762         }
3763     }
3764     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3765     for (unsigned Part = 0; Part < UF; ++Part) {
3766       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3767       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3768     }
3769   }
3770 
3771   // Reduce all of the unrolled parts into a single vector.
3772   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3773   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3774 
3775   // The middle block terminator has already been assigned a DebugLoc here (the
3776   // OrigLoop's single latch terminator). We want the whole middle block to
3777   // appear to execute on this line because: (a) it is all compiler generated,
3778   // (b) these instructions are always executed after evaluating the latch
3779   // conditional branch, and (c) other passes may add new predecessors which
3780   // terminate on this line. This is the easiest way to ensure we don't
3781   // accidentally cause an extra step back into the loop while debugging.
3782   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3783   for (unsigned Part = 1; Part < UF; ++Part) {
3784     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3785     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3786       // Floating point operations had to be 'fast' to enable the reduction.
3787       ReducedPartRdx = addFastMathFlag(
3788           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3789                               ReducedPartRdx, "bin.rdx"),
3790           RdxDesc.getFastMathFlags());
3791     else
3792       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3793                                       RdxPart);
3794   }
3795 
3796   if (VF > 1) {
3797     bool NoNaN = Legal->hasFunNoNaNAttr();
3798     ReducedPartRdx =
3799         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3800     // If the reduction can be performed in a smaller type, we need to extend
3801     // the reduction to the wider type before we branch to the original loop.
3802     if (Phi->getType() != RdxDesc.getRecurrenceType())
3803       ReducedPartRdx =
3804         RdxDesc.isSigned()
3805         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3806         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3807   }
3808 
3809   // Create a phi node that merges control-flow from the backedge-taken check
3810   // block and the middle block.
3811   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3812                                         LoopScalarPreHeader->getTerminator());
3813   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3814     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3815   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3816 
3817   // Now, we need to fix the users of the reduction variable
3818   // inside and outside of the scalar remainder loop.
3819   // We know that the loop is in LCSSA form. We need to update the
3820   // PHI nodes in the exit blocks.
3821   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    // All PHINodes should have a single incoming edge, or two if
    // we have already fixed them.
3824     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3825 
3826     // We found a reduction value exit-PHI. Update it with the
3827     // incoming bypass edge.
3828     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3829       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3830   } // end of the LCSSA phi scan.
3831 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3834   int IncomingEdgeBlockIdx =
3835     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3836   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3837   // Pick the other block.
3838   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3839   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3840   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3841 }
3842 
3843 void InnerLoopVectorizer::fixLCSSAPHIs() {
3844   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3845     if (LCSSAPhi.getNumIncomingValues() == 1) {
3846       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values are uniform, so lane zero suffices.
      unsigned LastLane = 0;
      if (isa<Instruction>(IncomingValue))
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF - 1;
3854       // Can be a loop invariant incoming value or the last scalar value to be
3855       // extracted from the vectorized loop.
3856       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
      Value *LastIncomingValue =
          getOrCreateScalarValue(IncomingValue, {UF - 1, LastLane});
      LCSSAPhi.addIncoming(LastIncomingValue, LoopMiddleBlock);
3860     }
3861   }
3862 }
3863 
3864 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3865   // The basic block and loop containing the predicated instruction.
3866   auto *PredBB = PredInst->getParent();
3867   auto *VectorLoop = LI->getLoopFor(PredBB);
3868 
3869   // Initialize a worklist with the operands of the predicated instruction.
3870   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3871 
3872   // Holds instructions that we need to analyze again. An instruction may be
3873   // reanalyzed if we don't yet know if we can sink it or not.
3874   SmallVector<Instruction *, 8> InstsToReanalyze;
3875 
3876   // Returns true if a given use occurs in the predicated block. Phi nodes use
3877   // their operands in their corresponding predecessor blocks.
3878   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3879     auto *I = cast<Instruction>(U.getUser());
3880     BasicBlock *BB = I->getParent();
3881     if (auto *Phi = dyn_cast<PHINode>(I))
3882       BB = Phi->getIncomingBlock(
3883           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3884     return BB == PredBB;
3885   };
3886 
3887   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm terminates once a
  // full pass over the worklist fails to sink any instruction.
3891   bool Changed;
3892   do {
3893     // Add the instructions that need to be reanalyzed to the worklist, and
3894     // reset the changed indicator.
3895     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3896     InstsToReanalyze.clear();
3897     Changed = false;
3898 
3899     while (!Worklist.empty()) {
3900       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3901 
3902       // We can't sink an instruction if it is a phi node, is already in the
3903       // predicated block, is not in the loop, or may have side effects.
3904       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3905           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3906         continue;
3907 
3908       // It's legal to sink the instruction if all its uses occur in the
3909       // predicated block. Otherwise, there's nothing to do yet, and we may
3910       // need to reanalyze the instruction.
3911       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3912         InstsToReanalyze.push_back(I);
3913         continue;
3914       }
3915 
3916       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
3918       I->moveBefore(&*PredBB->getFirstInsertionPt());
3919       Worklist.insert(I->op_begin(), I->op_end());
3920 
3921       // The sinking may have enabled other instructions to be sunk, so we will
3922       // need to iterate.
3923       Changed = true;
3924     }
3925   } while (Changed);
3926 }
3927 
3928 void InnerLoopVectorizer::fixNonInductionPHIs() {
3929   for (PHINode *OrigPhi : OrigPHIsToFix) {
3930     PHINode *NewPhi =
3931         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3932     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3933 
3934     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3935         predecessors(OrigPhi->getParent()));
3936     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3937         predecessors(NewPhi->getParent()));
3938     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3939            "Scalar and Vector BB should have the same number of predecessors");
3940 
3941     // The insertion point in Builder may be invalidated by the time we get
3942     // here. Force the Builder insertion point to something valid so that we do
3943     // not run into issues during insertion point restore in
3944     // getOrCreateVectorValue calls below.
3945     Builder.SetInsertPoint(NewPhi);
3946 
3947     // The predecessor order is preserved and we can rely on mapping between
3948     // scalar and vector block predecessors.
3949     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3950       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3951 
3952       // When looking up the new scalar/vector values to fix up, use incoming
3953       // values from original phi.
3954       Value *ScIncV =
3955           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3956 
      // Scalar incoming value may need a broadcast.
3958       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3959       NewPhi->addIncoming(NewIncV, NewPredBB);
3960     }
3961   }
3962 }
3963 
3964 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3965                                               unsigned VF) {
3966   PHINode *P = cast<PHINode>(PN);
3967   if (EnableVPlanNativePath) {
3968     // Currently we enter here in the VPlan-native path for non-induction
3969     // PHIs where all control flow is uniform. We simply widen these PHIs.
3970     // Create a vector phi with no operands - the vector phi operands will be
3971     // set at the end of vector code generation.
3972     Type *VecTy =
3973         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3974     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3975     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3976     OrigPHIsToFix.push_back(P);
3977 
3978     return;
3979   }
3980 
3981   assert(PN->getParent() == OrigLoop->getHeader() &&
3982          "Non-header phis should have been handled elsewhere");
3983 
3984   // In order to support recurrences we need to be able to vectorize Phi nodes.
3985   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3986   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3987   // this value when we vectorize all of the instructions that use the PHI.
3988   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3989     for (unsigned Part = 0; Part < UF; ++Part) {
3990       // This is phase one of vectorizing PHIs.
3991       Type *VecTy =
3992           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3993       Value *EntryPart = PHINode::Create(
3994           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3995       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3996     }
3997     return;
3998   }
3999 
4000   setDebugLocFromInst(Builder, P);
4001 
4002   // This PHINode must be an induction variable.
4003   // Make sure that we know about it.
4004   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4005 
4006   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4007   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4008 
4009   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4010   // which can be found from the original scalar operations.
4011   switch (II.getKind()) {
4012   case InductionDescriptor::IK_NoInduction:
4013     llvm_unreachable("Unknown induction");
4014   case InductionDescriptor::IK_IntInduction:
4015   case InductionDescriptor::IK_FpInduction:
4016     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4017   case InductionDescriptor::IK_PtrInduction: {
4018     // Handle the pointer induction variable case.
4019     assert(P->getType()->isPointerTy() && "Unexpected type.");
4020     // This is the normalized GEP that starts counting at zero.
4021     Value *PtrInd = Induction;
4022     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4023     // Determine the number of scalars we need to generate for each unroll
4024     // iteration. If the instruction is uniform, we only need to generate the
4025     // first lane. Otherwise, we generate all VF values.
4026     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4027     // These are the scalar results. Notice that we don't generate vector GEPs
4028     // because scalar GEPs result in better code.
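    // For example, with UF = 2, VF = 4 and a non-uniform pointer induction,
    // eight scalar GEPs are emitted, one for each of the indices
    // PtrInd + 0 .. PtrInd + 7 (transformed through the induction's start and
    // step).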
4029     for (unsigned Part = 0; Part < UF; ++Part) {
4030       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4031         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4032         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4033         Value *SclrGep =
4034             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4035         SclrGep->setName("next.gep");
4036         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4037       }
4038     }
4039     return;
4040   }
4041   }
4042 }
4043 
4044 /// A helper function for checking whether an integer division-related
4045 /// instruction may divide by zero (in which case it must be predicated if
4046 /// executed conditionally in the scalar code).
4047 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
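/// For example, 'udiv i32 %x, 4' cannot divide by zero, whereas both
/// 'udiv i32 %x, %y' and 'udiv i32 %x, 0' are treated as potentially
/// dividing by zero.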
4051 static bool mayDivideByZero(Instruction &I) {
4052   assert((I.getOpcode() == Instruction::UDiv ||
4053           I.getOpcode() == Instruction::SDiv ||
4054           I.getOpcode() == Instruction::URem ||
4055           I.getOpcode() == Instruction::SRem) &&
4056          "Unexpected instruction");
4057   Value *Divisor = I.getOperand(1);
4058   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4059   return !CInt || CInt->isZero();
4060 }
4061 
4062 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4063   switch (I.getOpcode()) {
4064   case Instruction::Br:
4065   case Instruction::PHI:
4066     llvm_unreachable("This instruction is handled by a different recipe.");
4067   case Instruction::GetElementPtr: {
4068     // Construct a vector GEP by widening the operands of the scalar GEP as
4069     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4070     // results in a vector of pointers when at least one operand of the GEP
4071     // is vector-typed. Thus, to keep the representation compact, we only use
4072     // vector-typed operands for loop-varying values.
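    // For example, for A[i] with a loop-invariant base pointer A, only the
    // index is widened, and the resulting GEP still produces a vector of
    // pointers.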
4073     auto *GEP = cast<GetElementPtrInst>(&I);
4074 
4075     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4076       // If we are vectorizing, but the GEP has only loop-invariant operands,
4077       // the GEP we build (by only using vector-typed operands for
4078       // loop-varying values) would be a scalar pointer. Thus, to ensure we
4079       // produce a vector of pointers, we need to either arbitrarily pick an
4080       // operand to broadcast, or broadcast a clone of the original GEP.
4081       // Here, we broadcast a clone of the original.
4082       //
4083       // TODO: If at some point we decide to scalarize instructions having
4084       //       loop-invariant operands, this special case will no longer be
4085       //       required. We would add the scalarization decision to
4086       //       collectLoopScalars() and teach getVectorValue() to broadcast
4087       //       the lane-zero scalar value.
4088       auto *Clone = Builder.Insert(GEP->clone());
4089       for (unsigned Part = 0; Part < UF; ++Part) {
4090         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4091         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4092         addMetadata(EntryPart, GEP);
4093       }
4094     } else {
4095       // If the GEP has at least one loop-varying operand, we are sure to
4096       // produce a vector of pointers. But if we are only unrolling, we want
4097       // to produce a scalar GEP for each unroll part. Thus, the GEP we
4098       // produce with the code below will be scalar (if VF == 1) or vector
4099       // (otherwise). Note that for the unroll-only case, we still maintain
4100       // values in the vector mapping with initVector, as we do for other
4101       // instructions.
4102       for (unsigned Part = 0; Part < UF; ++Part) {
4103         // The pointer operand of the new GEP. If it's loop-invariant, we
4104         // won't broadcast it.
4105         auto *Ptr =
4106             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4107                 ? GEP->getPointerOperand()
4108                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4109 
4110         // Collect all the indices for the new GEP. If any index is
4111         // loop-invariant, we won't broadcast it.
4112         SmallVector<Value *, 4> Indices;
4113         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4114           if (OrigLoop->isLoopInvariant(U.get()))
4115             Indices.push_back(U.get());
4116           else
4117             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4118         }
4119 
4120         // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
        // but it should be a vector otherwise.
4122         auto *NewGEP =
4123             GEP->isInBounds()
4124                 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4125                                             Indices)
4126                 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4127         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4128                "NewGEP is not a pointer vector");
4129         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4130         addMetadata(NewGEP, GEP);
4131       }
4132     }
4133 
4134     break;
4135   }
4136   case Instruction::UDiv:
4137   case Instruction::SDiv:
4138   case Instruction::SRem:
4139   case Instruction::URem:
4140   case Instruction::Add:
4141   case Instruction::FAdd:
4142   case Instruction::Sub:
4143   case Instruction::FSub:
4144   case Instruction::FNeg:
4145   case Instruction::Mul:
4146   case Instruction::FMul:
4147   case Instruction::FDiv:
4148   case Instruction::FRem:
4149   case Instruction::Shl:
4150   case Instruction::LShr:
4151   case Instruction::AShr:
4152   case Instruction::And:
4153   case Instruction::Or:
4154   case Instruction::Xor: {
4155     // Just widen unops and binops.
4156     setDebugLocFromInst(Builder, &I);
4157 
4158     for (unsigned Part = 0; Part < UF; ++Part) {
4159       SmallVector<Value *, 2> Ops;
4160       for (Value *Op : I.operands())
4161         Ops.push_back(getOrCreateVectorValue(Op, Part));
4162 
4163       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4164 
4165       if (auto *VecOp = dyn_cast<Instruction>(V))
4166         VecOp->copyIRFlags(&I);
4167 
4168       // Use this vector value for all users of the original instruction.
4169       VectorLoopValueMap.setVectorValue(&I, Part, V);
4170       addMetadata(V, &I);
4171     }
4172 
4173     break;
4174   }
4175   case Instruction::Select: {
4176     // Widen selects.
4177     // If the selector is loop invariant we can create a select
4178     // instruction with a scalar condition. Otherwise, use vector-select.
4179     auto *SE = PSE.getSE();
4180     bool InvariantCond =
4181         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4182     setDebugLocFromInst(Builder, &I);
4183 
    // The condition can be loop invariant but still defined inside the
    // loop. This means that we can't just use the original 'cond' value.
    // We have to take the 'vectorized' value and pick the first lane.
    // InstCombine will make this a no-op.
4188 
4189     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4190 
4191     for (unsigned Part = 0; Part < UF; ++Part) {
4192       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4193       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4194       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4195       Value *Sel =
4196           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4197       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4198       addMetadata(Sel, &I);
4199     }
4200 
4201     break;
4202   }
4203 
4204   case Instruction::ICmp:
4205   case Instruction::FCmp: {
4206     // Widen compares. Generate vector compares.
4207     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4208     auto *Cmp = cast<CmpInst>(&I);
4209     setDebugLocFromInst(Builder, Cmp);
4210     for (unsigned Part = 0; Part < UF; ++Part) {
4211       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4212       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4213       Value *C = nullptr;
4214       if (FCmp) {
4215         // Propagate fast math flags.
4216         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4217         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4218         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4219       } else {
4220         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4221       }
4222       VectorLoopValueMap.setVectorValue(&I, Part, C);
4223       addMetadata(C, &I);
4224     }
4225 
4226     break;
4227   }
4228 
4229   case Instruction::ZExt:
4230   case Instruction::SExt:
4231   case Instruction::FPToUI:
4232   case Instruction::FPToSI:
4233   case Instruction::FPExt:
4234   case Instruction::PtrToInt:
4235   case Instruction::IntToPtr:
4236   case Instruction::SIToFP:
4237   case Instruction::UIToFP:
4238   case Instruction::Trunc:
4239   case Instruction::FPTrunc:
4240   case Instruction::BitCast: {
4241     auto *CI = cast<CastInst>(&I);
4242     setDebugLocFromInst(Builder, CI);
4243 
4244     /// Vectorize casts.
4245     Type *DestTy =
4246         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4247 
4248     for (unsigned Part = 0; Part < UF; ++Part) {
4249       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4250       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4251       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4252       addMetadata(Cast, &I);
4253     }
4254     break;
4255   }
4256 
4257   case Instruction::Call: {
4258     // Ignore dbg intrinsics.
4259     if (isa<DbgInfoIntrinsic>(I))
4260       break;
4261     setDebugLocFromInst(Builder, &I);
4262 
4263     Module *M = I.getParent()->getParent()->getParent();
4264     auto *CI = cast<CallInst>(&I);
4265 
4266     StringRef FnName = CI->getCalledFunction()->getName();
4267     Function *F = CI->getCalledFunction();
4268     Type *RetTy = ToVectorTy(CI->getType(), VF);
4269     SmallVector<Type *, 4> Tys;
4270     for (Value *ArgOperand : CI->arg_operands())
4271       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4272 
4273     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4274 
    // Decide whether to use an intrinsic or a plain call for the vectorized
    // version of the instruction, depending on whether the intrinsic call is
    // cheaper than the library call.
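    // For example, a call to llvm.sqrt.f32 may be widened to the
    // llvm.sqrt.v4f32 intrinsic for VF = 4, or to a vector library function
    // provided by TLI, whichever the cost model considers cheaper.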
4278     bool NeedToScalarize;
4279     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4280     bool UseVectorIntrinsic =
4281         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4282     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4283            "Instruction should be scalarized elsewhere.");
4284 
4285     for (unsigned Part = 0; Part < UF; ++Part) {
4286       SmallVector<Value *, 4> Args;
4287       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4288         Value *Arg = CI->getArgOperand(i);
4289         // Some intrinsics have a scalar argument - don't replace it with a
4290         // vector.
4291         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4292           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4293         Args.push_back(Arg);
4294       }
4295 
4296       Function *VectorF;
4297       if (UseVectorIntrinsic) {
4298         // Use vector version of the intrinsic.
4299         Type *TysForDecl[] = {CI->getType()};
4300         if (VF > 1)
4301           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4302         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4303       } else {
4304         // Use vector version of the library call.
4305         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4306         assert(!VFnName.empty() && "Vector function name is empty.");
4307         VectorF = M->getFunction(VFnName);
4308         if (!VectorF) {
4309           // Generate a declaration
4310           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4311           VectorF =
4312               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4313           VectorF->copyAttributesFrom(F);
4314         }
4315       }
4316       assert(VectorF && "Can't create vector function.");
4317 
4318       SmallVector<OperandBundleDef, 1> OpBundles;
4319       CI->getOperandBundlesAsDefs(OpBundles);
4320       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4321 
4322       if (isa<FPMathOperator>(V))
4323         V->copyFastMathFlags(CI);
4324 
4325       VectorLoopValueMap.setVectorValue(&I, Part, V);
4326       addMetadata(V, &I);
4327     }
4328 
4329     break;
4330   }
4331 
4332   default:
4333     // This instruction is not vectorized by simple widening.
4334     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4335     llvm_unreachable("Unhandled instruction!");
4336   } // end of switch.
4337 }
4338 
4339 void InnerLoopVectorizer::updateAnalysis() {
4340   // Forget the original basic block.
4341   PSE.getSE()->forgetLoop(OrigLoop);
4342 
4343   // DT is not kept up-to-date for outer loop vectorization
4344   if (EnableVPlanNativePath)
4345     return;
4346 
4347   // Update the dominator tree information.
4348   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4349          "Entry does not dominate exit.");
4350 
4351   DT->addNewBlock(LoopMiddleBlock,
4352                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4353   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4354   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4355   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4356   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4357 }
4358 
4359 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4360   // We should not collect Scalars more than once per VF. Right now, this
4361   // function is called from collectUniformsAndScalars(), which already does
4362   // this check. Collecting Scalars for VF=1 does not make any sense.
4363   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4364          "This function should not be visited twice for the same VF");
4365 
4366   SmallSetVector<Instruction *, 8> Worklist;
4367 
4368   // These sets are used to seed the analysis with pointers used by memory
4369   // accesses that will remain scalar.
4370   SmallSetVector<Instruction *, 8> ScalarPtrs;
4371   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4372 
4373   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4374   // The pointer operands of loads and stores will be scalar as long as the
4375   // memory access is not a gather or scatter operation. The value operand of a
4376   // store will remain scalar if the store is scalarized.
4377   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4378     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4379     assert(WideningDecision != CM_Unknown &&
4380            "Widening decision should be ready at this moment");
4381     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4382       if (Ptr == Store->getValueOperand())
4383         return WideningDecision == CM_Scalarize;
4384     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4385            "Ptr is neither a value or pointer operand");
4386     return WideningDecision != CM_GatherScatter;
4387   };
4388 
4389   // A helper that returns true if the given value is a bitcast or
4390   // getelementptr instruction contained in the loop.
4391   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4392     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4393             isa<GetElementPtrInst>(V)) &&
4394            !TheLoop->isLoopInvariant(V);
4395   };
4396 
4397   // A helper that evaluates a memory access's use of a pointer. If the use
4398   // will be a scalar use, and the pointer is only used by memory accesses, we
4399   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4400   // PossibleNonScalarPtrs.
4401   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4402     // We only care about bitcast and getelementptr instructions contained in
4403     // the loop.
4404     if (!isLoopVaryingBitCastOrGEP(Ptr))
4405       return;
4406 
4407     // If the pointer has already been identified as scalar (e.g., if it was
4408     // also identified as uniform), there's nothing to do.
4409     auto *I = cast<Instruction>(Ptr);
4410     if (Worklist.count(I))
4411       return;
4412 
4413     // If the use of the pointer will be a scalar use, and all users of the
4414     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4415     // place the pointer in PossibleNonScalarPtrs.
4416     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4417           return isa<LoadInst>(U) || isa<StoreInst>(U);
4418         }))
4419       ScalarPtrs.insert(I);
4420     else
4421       PossibleNonScalarPtrs.insert(I);
4422   };
4423 
4424   // We seed the scalars analysis with three classes of instructions: (1)
4425   // instructions marked uniform-after-vectorization, (2) bitcast and
4426   // getelementptr instructions used by memory accesses requiring a scalar use,
4427   // and (3) pointer induction variables and their update instructions (we
4428   // currently only scalarize these).
4429   //
4430   // (1) Add to the worklist all instructions that have been identified as
4431   // uniform-after-vectorization.
4432   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4433 
4434   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4435   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4437   // scatter operation. The value operand of a store will remain scalar if the
4438   // store is scalarized.
4439   for (auto *BB : TheLoop->blocks())
4440     for (auto &I : *BB) {
4441       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4442         evaluatePtrUse(Load, Load->getPointerOperand());
4443       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4444         evaluatePtrUse(Store, Store->getPointerOperand());
4445         evaluatePtrUse(Store, Store->getValueOperand());
4446       }
4447     }
4448   for (auto *I : ScalarPtrs)
4449     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4450       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4451       Worklist.insert(I);
4452     }
4453 
4454   // (3) Add to the worklist all pointer induction variables and their update
4455   // instructions.
4456   //
4457   // TODO: Once we are able to vectorize pointer induction variables we should
4458   //       no longer insert them into the worklist here.
4459   auto *Latch = TheLoop->getLoopLatch();
4460   for (auto &Induction : *Legal->getInductionVars()) {
4461     auto *Ind = Induction.first;
4462     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4463     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4464       continue;
4465     Worklist.insert(Ind);
4466     Worklist.insert(IndUpdate);
4467     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4468     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4469                       << "\n");
4470   }
4471 
4472   // Insert the forced scalars.
4473   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4474   // induction variable when the PHI user is scalarized.
4475   auto ForcedScalar = ForcedScalars.find(VF);
4476   if (ForcedScalar != ForcedScalars.end())
4477     for (auto *I : ForcedScalar->second)
4478       Worklist.insert(I);
4479 
4480   // Expand the worklist by looking through any bitcasts and getelementptr
4481   // instructions we've already identified as scalar. This is similar to the
4482   // expansion step in collectLoopUniforms(); however, here we're only
4483   // expanding to include additional bitcasts and getelementptr instructions.
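  // For example, if a getelementptr already known to be scalar consumes a
  // loop-varying bitcast whose other users are all memory accesses that use it
  // as a scalar pointer, the bitcast is added to the worklist as well.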
4484   unsigned Idx = 0;
4485   while (Idx != Worklist.size()) {
4486     Instruction *Dst = Worklist[Idx++];
4487     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4488       continue;
4489     auto *Src = cast<Instruction>(Dst->getOperand(0));
4490     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4491           auto *J = cast<Instruction>(U);
4492           return !TheLoop->contains(J) || Worklist.count(J) ||
4493                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4494                   isScalarUse(J, Src));
4495         })) {
4496       Worklist.insert(Src);
4497       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4498     }
4499   }
4500 
4501   // An induction variable will remain scalar if all users of the induction
4502   // variable and induction variable update remain scalar.
4503   for (auto &Induction : *Legal->getInductionVars()) {
4504     auto *Ind = Induction.first;
4505     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4506 
4507     // We already considered pointer induction variables, so there's no reason
4508     // to look at their users again.
4509     //
4510     // TODO: Once we are able to vectorize pointer induction variables we
4511     //       should no longer skip over them here.
4512     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4513       continue;
4514 
4515     // Determine if all users of the induction variable are scalar after
4516     // vectorization.
4517     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4518       auto *I = cast<Instruction>(U);
4519       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4520     });
4521     if (!ScalarInd)
4522       continue;
4523 
4524     // Determine if all users of the induction variable update instruction are
4525     // scalar after vectorization.
4526     auto ScalarIndUpdate =
4527         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4528           auto *I = cast<Instruction>(U);
4529           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4530         });
4531     if (!ScalarIndUpdate)
4532       continue;
4533 
4534     // The induction variable and its update instruction will remain scalar.
4535     Worklist.insert(Ind);
4536     Worklist.insert(IndUpdate);
4537     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4538     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4539                       << "\n");
4540   }
4541 
4542   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4543 }
4544 
4545 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4546   if (!blockNeedsPredication(I->getParent()))
4547     return false;
  switch (I->getOpcode()) {
4549   default:
4550     break;
4551   case Instruction::Load:
4552   case Instruction::Store: {
4553     if (!Legal->isMaskRequired(I))
4554       return false;
4555     auto *Ptr = getLoadStorePointerOperand(I);
4556     auto *Ty = getMemInstValueType(I);
4557     // We have already decided how to vectorize this instruction, get that
4558     // result.
4559     if (VF > 1) {
4560       InstWidening WideningDecision = getWideningDecision(I, VF);
4561       assert(WideningDecision != CM_Unknown &&
4562              "Widening decision should be ready at this moment");
4563       return WideningDecision == CM_Scalarize;
4564     }
4565     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4566     return isa<LoadInst>(I) ?
4567         !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
4568       : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
4569   }
4570   case Instruction::UDiv:
4571   case Instruction::SDiv:
4572   case Instruction::SRem:
4573   case Instruction::URem:
4574     return mayDivideByZero(*I);
4575   }
4576   return false;
4577 }
4578 
4579 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4580                                                                unsigned VF) {
4581   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4582   assert(getWideningDecision(I, VF) == CM_Unknown &&
4583          "Decision should not be set yet.");
4584   auto *Group = getInterleavedAccessGroup(I);
4585   assert(Group && "Must have a group.");
4586 
  // If the instruction's allocated size doesn't equal its type size, it
4588   // requires padding and will be scalarized.
4589   auto &DL = I->getModule()->getDataLayout();
4590   auto *ScalarTy = getMemInstValueType(I);
4591   if (hasIrregularType(ScalarTy, DL, VF))
4592     return false;
4593 
4594   // Check if masking is required.
4595   // A Group may need masking for one of two reasons: it resides in a block that
4596   // needs predication, or it was decided to use masking to deal with gaps.
4597   bool PredicatedAccessRequiresMasking =
4598       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4599   bool AccessWithGapsRequiresMasking =
4600       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4601   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4602     return true;
4603 
4604   // If masked interleaving is required, we expect that the user/target had
4605   // enabled it, because otherwise it either wouldn't have been created or
4606   // it should have been invalidated by the CostModel.
4607   assert(useMaskedInterleavedAccesses(TTI) &&
4608          "Masked interleave-groups for predicated accesses are not enabled.");
4609 
4610   auto *Ty = getMemInstValueType(I);
4611   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4612   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4613                           : TTI.isLegalMaskedStore(Ty, Alignment);
4614 }
4615 
4616 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4617                                                                unsigned VF) {
4618   // Get and ensure we have a valid memory instruction.
4619   LoadInst *LI = dyn_cast<LoadInst>(I);
4620   StoreInst *SI = dyn_cast<StoreInst>(I);
4621   assert((LI || SI) && "Invalid memory instruction");
4622 
4623   auto *Ptr = getLoadStorePointerOperand(I);
4624 
4625   // In order to be widened, the pointer should be consecutive, first of all.
4626   if (!Legal->isConsecutivePtr(Ptr))
4627     return false;
4628 
4629   // If the instruction is a store located in a predicated block, it will be
4630   // scalarized.
4631   if (isScalarWithPredication(I))
4632     return false;
4633 
  // If the instruction's allocated size doesn't equal its type size, it
4635   // requires padding and will be scalarized.
4636   auto &DL = I->getModule()->getDataLayout();
4637   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4638   if (hasIrregularType(ScalarTy, DL, VF))
4639     return false;
4640 
4641   return true;
4642 }
4643 
4644 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4645   // We should not collect Uniforms more than once per VF. Right now,
4646   // this function is called from collectUniformsAndScalars(), which
4647   // already does this check. Collecting Uniforms for VF=1 does not make any
4648   // sense.
4649 
4650   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4651          "This function should not be visited twice for the same VF");
4652 
  // Create an entry for this VF in the Uniforms map so that, even if no
  // uniform value is found, we will not analyze this VF again:
  // Uniforms.count(VF) will return 1.
4655   Uniforms[VF].clear();
4656 
4657   // We now know that the loop is vectorizable!
4658   // Collect instructions inside the loop that will remain uniform after
4659   // vectorization.
4660 
4661   // Global values, params and instructions outside of current loop are out of
4662   // scope.
4663   auto isOutOfScope = [&](Value *V) -> bool {
4664     Instruction *I = dyn_cast<Instruction>(V);
4665     return (!I || !TheLoop->contains(I));
4666   };
4667 
4668   SetVector<Instruction *> Worklist;
4669   BasicBlock *Latch = TheLoop->getLoopLatch();
4670 
4671   // Start with the conditional branch. If the branch condition is an
4672   // instruction contained in the loop that is only used by the branch, it is
4673   // uniform.
4674   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4675   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4676     Worklist.insert(Cmp);
4677     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4678   }
4679 
4680   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4681   // are pointers that are treated like consecutive pointers during
4682   // vectorization. The pointer operands of interleaved accesses are an
4683   // example.
4684   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4685 
4686   // Holds pointer operands of instructions that are possibly non-uniform.
4687   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4688 
4689   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4690     InstWidening WideningDecision = getWideningDecision(I, VF);
4691     assert(WideningDecision != CM_Unknown &&
4692            "Widening decision should be ready at this moment");
4693 
4694     return (WideningDecision == CM_Widen ||
4695             WideningDecision == CM_Widen_Reverse ||
4696             WideningDecision == CM_Interleave);
4697   };
4698   // Iterate over the instructions in the loop, and collect all
4699   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4700   // that a consecutive-like pointer operand will be scalarized, we collect it
4701   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4702   // getelementptr instruction can be used by both vectorized and scalarized
4703   // memory instructions. For example, if a loop loads and stores from the same
4704   // location, but the store is conditional, the store will be scalarized, and
4705   // the getelementptr won't remain uniform.
4706   for (auto *BB : TheLoop->blocks())
4707     for (auto &I : *BB) {
4708       // If there's no pointer operand, there's nothing to do.
4709       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4710       if (!Ptr)
4711         continue;
4712 
4713       // True if all users of Ptr are memory accesses that have Ptr as their
4714       // pointer operand.
4715       auto UsersAreMemAccesses =
4716           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4717             return getLoadStorePointerOperand(U) == Ptr;
4718           });
4719 
4720       // Ensure the memory instruction will not be scalarized or used by
4721       // gather/scatter, making its pointer operand non-uniform. If the pointer
4722       // operand is used by any instruction other than a memory access, we
4723       // conservatively assume the pointer operand may be non-uniform.
4724       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4725         PossibleNonUniformPtrs.insert(Ptr);
4726 
4727       // If the memory instruction will be vectorized and its pointer operand
4728       // is consecutive-like, or interleaving - the pointer operand should
4729       // remain uniform.
4730       else
4731         ConsecutiveLikePtrs.insert(Ptr);
4732     }
4733 
4734   // Add to the Worklist all consecutive and consecutive-like pointers that
4735   // aren't also identified as possibly non-uniform.
4736   for (auto *V : ConsecutiveLikePtrs)
4737     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4738       LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4739       Worklist.insert(V);
4740     }
4741 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *I = Worklist[Idx++];
4748 
4749     for (auto OV : I->operand_values()) {
4750       // isOutOfScope operands cannot be uniform instructions.
4751       if (isOutOfScope(OV))
4752         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4755       auto *OP = dyn_cast<PHINode>(OV);
4756       if (OP && Legal->isFirstOrderRecurrence(OP))
4757         continue;
4758       // If all the users of the operand are uniform, then add the
4759       // operand into the uniform worklist.
4760       auto *OI = cast<Instruction>(OV);
4761       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4762             auto *J = cast<Instruction>(U);
4763             return Worklist.count(J) ||
4764                    (OI == getLoadStorePointerOperand(J) &&
4765                     isUniformDecision(J, VF));
4766           })) {
4767         Worklist.insert(OI);
4768         LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4769       }
4770     }
4771   }
4772 
4773   // Returns true if Ptr is the pointer operand of a memory access instruction
4774   // I, and I is known to not require scalarization.
4775   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4776     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4777   };
4778 
4779   // For an instruction to be added into Worklist above, all its users inside
4780   // the loop should also be in Worklist. However, this condition cannot be
4781   // true for phi nodes that form a cyclic dependence. We must process phi
4782   // nodes separately. An induction variable will remain uniform if all users
4783   // of the induction variable and induction variable update remain uniform.
4784   // The code below handles both pointer and non-pointer induction variables.
4785   for (auto &Induction : *Legal->getInductionVars()) {
4786     auto *Ind = Induction.first;
4787     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4788 
4789     // Determine if all users of the induction variable are uniform after
4790     // vectorization.
4791     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4792       auto *I = cast<Instruction>(U);
4793       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4794              isVectorizedMemAccessUse(I, Ind);
4795     });
4796     if (!UniformInd)
4797       continue;
4798 
4799     // Determine if all users of the induction variable update instruction are
4800     // uniform after vectorization.
4801     auto UniformIndUpdate =
4802         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4803           auto *I = cast<Instruction>(U);
4804           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4805                  isVectorizedMemAccessUse(I, IndUpdate);
4806         });
4807     if (!UniformIndUpdate)
4808       continue;
4809 
4810     // The induction variable and its update instruction will remain uniform.
4811     Worklist.insert(Ind);
4812     Worklist.insert(IndUpdate);
4813     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4814     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4815                       << "\n");
4816   }
4817 
4818   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4819 }
4820 
4821 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4822   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4823 
4824   if (Legal->getRuntimePointerChecking()->Need) {
4825     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4826         "runtime pointer checks needed. Enable vectorization of this "
4827         "loop with '#pragma clang loop vectorize(enable)' when "
4828         "compiling with -Os/-Oz",
4829         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4830     return true;
4831   }
4832 
4833   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4834     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4835         "runtime SCEV checks needed. Enable vectorization of this "
4836         "loop with '#pragma clang loop vectorize(enable)' when "
4837         "compiling with -Os/-Oz",
4838         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4839     return true;
4840   }
4841 
4842   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4843   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4844     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4845         "runtime stride == 1 checks needed. Enable vectorization of "
4846         "this loop with '#pragma clang loop vectorize(enable)' when "
4847         "compiling with -Os/-Oz",
4848         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4849     return true;
4850   }
4851 
4852   return false;
4853 }
4854 
4855 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4856   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this check anyway, since the condition is
    // still likely to be dynamically uniform if the target can skip it.
4859     reportVectorizationFailure(
4860         "Not inserting runtime ptr check for divergent target",
4861         "runtime pointer checks needed. Not enabled for divergent target",
4862         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4863     return None;
4864   }
4865 
4866   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4867   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4868   if (TC == 1) {
4869     reportVectorizationFailure("Single iteration (non) loop",
4870         "loop trip count is one, irrelevant for vectorization",
4871         "SingleIterationLoop", ORE, TheLoop);
4872     return None;
4873   }
4874 
4875   switch (ScalarEpilogueStatus) {
4876   case CM_ScalarEpilogueAllowed:
4877     return computeFeasibleMaxVF(TC);
4878   case CM_ScalarEpilogueNotNeededUsePredicate:
4879     LLVM_DEBUG(
4880         dbgs() << "LV: vector predicate hint/switch found.\n"
4881                << "LV: Not allowing scalar epilogue, creating predicated "
4882                << "vector loop.\n");
4883     break;
4884   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4885     // fallthrough as a special case of OptForSize
4886   case CM_ScalarEpilogueNotAllowedOptSize:
4887     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4888       LLVM_DEBUG(
4889           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4890     else
4891       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4892                         << "count.\n");
4893 
    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
4896     if (runtimeChecksRequired())
4897       return None;
4898     break;
4899   }
4900 
  // Now try to fold the tail by masking.
4902 
4903   // Invalidate interleave groups that require an epilogue if we can't mask
4904   // the interleave-group.
4905   if (!useMaskedInterleavedAccesses(TTI))
4906     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4907 
4908   unsigned MaxVF = computeFeasibleMaxVF(TC);
4909   if (TC > 0 && TC % MaxVF == 0) {
4910     // Accept MaxVF if we do not have a tail.
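    // For example (illustrative values only), a constant trip count of 64
    // with MaxVF = 8 leaves no tail, since 64 % 8 == 0.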
4911     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4912     return MaxVF;
4913   }
4914 
4915   // If we don't know the precise trip count, or if the trip count that we
4916   // found modulo the vectorization factor is not zero, try to fold the tail
4917   // by masking.
4918   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4919   if (Legal->prepareToFoldTailByMasking()) {
4920     FoldTailByMasking = true;
4921     return MaxVF;
4922   }
4923 
4924   if (TC == 0) {
4925     reportVectorizationFailure(
4926         "Unable to calculate the loop count due to complex control flow",
4927         "unable to calculate the loop count due to complex control flow",
4928         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4929     return None;
4930   }
4931 
4932   reportVectorizationFailure(
4933       "Cannot optimize for size and vectorize at the same time.",
4934       "cannot optimize for size and vectorize at the same time. "
4935       "Enable vectorization of this loop with '#pragma clang loop "
4936       "vectorize(enable)' when compiling with -Os/-Oz",
4937       "NoTailLoopWithOptForSize", ORE, TheLoop);
4938   return None;
4939 }
4940 
4941 unsigned
4942 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4943   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4944   unsigned SmallestType, WidestType;
4945   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4946   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4947 
4948   // Get the maximum safe dependence distance in bits computed by LAA.
4949   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
4951   // dependence distance).
4952   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4953 
4954   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4955 
4956   unsigned MaxVectorSize = WidestRegister / WidestType;
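  // Illustrative example (not target-specific): with a 256-bit widest register
  // and a widest element type of 32 bits, MaxVectorSize = 256 / 32 = 8.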
4957 
4958   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4959                     << " / " << WidestType << " bits.\n");
4960   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4961                     << WidestRegister << " bits.\n");
4962 
4963   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4964                                  " into one vector!");
4965   if (MaxVectorSize == 0) {
4966     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4967     MaxVectorSize = 1;
4968     return MaxVectorSize;
4969   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4970              isPowerOf2_32(ConstTripCount)) {
4971     // We need to clamp the VF to be the ConstTripCount. There is no point in
4972     // choosing a higher viable VF as done in the loop below.
4973     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4974                       << ConstTripCount << "\n");
4975     MaxVectorSize = ConstTripCount;
4976     return MaxVectorSize;
4977   }
4978 
4979   unsigned MaxVF = MaxVectorSize;
4980   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4981       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4982     // Collect all viable vectorization factors larger than the default MaxVF
4983     // (i.e. MaxVectorSize).
4984     SmallVector<unsigned, 8> VFs;
4985     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4986     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4987       VFs.push_back(VS);
4988 
4989     // For each VF calculate its register usage.
4990     auto RUs = calculateRegisterUsage(VFs);
4991 
4992     // Select the largest VF which doesn't require more registers than existing
4993     // ones.
4994     for (int i = RUs.size() - 1; i >= 0; --i) {
4995       bool Selected = true;
4996       for (auto& pair : RUs[i].MaxLocalUsers) {
4997         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4998         if (pair.second > TargetNumRegisters)
4999           Selected = false;
5000       }
5001       if (Selected) {
5002         MaxVF = VFs[i];
5003         break;
5004       }
5005     }
5006     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5007       if (MaxVF < MinVF) {
5008         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5009                           << ") with target's minimum: " << MinVF << '\n');
5010         MaxVF = MinVF;
5011       }
5012     }
5013   }
5014   return MaxVF;
5015 }
5016 
5017 VectorizationFactor
5018 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5019   float Cost = expectedCost(1).first;
5020   const float ScalarCost = Cost;
5021   unsigned Width = 1;
5022   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5023 
5024   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5025   if (ForceVectorization && MaxVF > 1) {
5026     // Ignore scalar width, because the user explicitly wants vectorization.
5027     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5028     // evaluation.
5029     Cost = std::numeric_limits<float>::max();
5030   }
5031 
5032   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to execute fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
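    // Illustrative example with made-up costs: if the scalar loop costs 8 and
    // the VF = 4 loop costs 20, the per-lane cost is 20 / 4 = 5, which beats
    // 8, so VF = 4 would be preferred over the scalar loop.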
5036     VectorizationCostTy C = expectedCost(i);
5037     float VectorCost = C.first / (float)i;
5038     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5039                       << " costs: " << (int)VectorCost << ".\n");
5040     if (!C.second && !ForceVectorization) {
5041       LLVM_DEBUG(
5042           dbgs() << "LV: Not considering vector loop of width " << i
5043                  << " because it will not generate any vector instructions.\n");
5044       continue;
5045     }
5046     if (VectorCost < Cost) {
5047       Cost = VectorCost;
5048       Width = i;
5049     }
5050   }
5051 
5052   if (!EnableCondStoresVectorization && NumPredStores) {
5053     reportVectorizationFailure("There are conditional stores.",
5054         "store that is conditionally executed prevents vectorization",
5055         "ConditionalStore", ORE, TheLoop);
5056     Width = 1;
5057     Cost = ScalarCost;
5058   }
5059 
5060   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5061              << "LV: Vectorization seems to be not beneficial, "
5062              << "but was forced by a user.\n");
5063   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5064   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5065   return Factor;
5066 }
5067 
5068 std::pair<unsigned, unsigned>
5069 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5070   unsigned MinWidth = -1U;
5071   unsigned MaxWidth = 8;
5072   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5073 
5074   // For each block.
5075   for (BasicBlock *BB : TheLoop->blocks()) {
5076     // For each instruction in the loop.
5077     for (Instruction &I : BB->instructionsWithoutDebug()) {
5078       Type *T = I.getType();
5079 
5080       // Skip ignored values.
5081       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5082         continue;
5083 
5084       // Only examine Loads, Stores and PHINodes.
5085       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5086         continue;
5087 
5088       // Examine PHI nodes that are reduction variables. Update the type to
5089       // account for the recurrence type.
5090       if (auto *PN = dyn_cast<PHINode>(&I)) {
5091         if (!Legal->isReductionVariable(PN))
5092           continue;
5093         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5094         T = RdxDesc.getRecurrenceType();
5095       }
5096 
5097       // Examine the stored values.
5098       if (auto *ST = dyn_cast<StoreInst>(&I))
5099         T = ST->getValueOperand()->getType();
5100 
5101       // Ignore loaded pointer types and stored pointer types that are not
5102       // vectorizable.
5103       //
5104       // FIXME: The check here attempts to predict whether a load or store will
5105       //        be vectorized. We only know this for certain after a VF has
5106       //        been selected. Here, we assume that if an access can be
5107       //        vectorized, it will be. We should also look at extending this
5108       //        optimization to non-pointer types.
5109       //
5110       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5111           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5112         continue;
5113 
5114       MinWidth = std::min(MinWidth,
5115                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5116       MaxWidth = std::max(MaxWidth,
5117                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5118     }
5119   }
5120 
5121   return {MinWidth, MaxWidth};
5122 }
5123 
5124 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5125                                                            unsigned LoopCost) {
5126   // -- The interleave heuristics --
5127   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5128   // There are many micro-architectural considerations that we can't predict
5129   // at this level. For example, frontend pressure (on decode or fetch) due to
5130   // code size, or the number and capabilities of the execution ports.
5131   //
5132   // We use the following heuristics to select the interleave count:
5133   // 1. If the code has reductions, then we interleave to break the cross
5134   // iteration dependency.
5135   // 2. If the loop is really small, then we interleave to reduce the loop
5136   // overhead.
5137   // 3. We don't interleave if we think that we will spill registers to memory
5138   // due to the increased register pressure.
5139 
5140   if (!isScalarEpilogueAllowed())
5141     return 1;
5142 
  // If a maximum safe dependence distance limited the VF, do not interleave;
  // interleaving would effectively widen the accesses beyond that distance.
5144   if (Legal->getMaxSafeDepDistBytes() != -1U)
5145     return 1;
5146 
5147   // Do not interleave loops with a relatively small known or estimated trip
5148   // count.
5149   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5150   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5151     return 1;
5152 
5153   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so make sure that each register class
  // reports at least one used register to avoid dividing by zero.
5156   for (auto& pair : R.MaxLocalUsers) {
5157     pair.second = std::max(pair.second, 1U);
5158   }
5159 
5160   // We calculate the interleave count using the following formula.
5161   // Subtract the number of loop invariants from the number of available
5162   // registers. These registers are used by all of the interleaved instances.
5163   // Next, divide the remaining registers by the number of registers that is
5164   // required by the loop, in order to estimate how many parallel instances
5165   // fit without causing spills. All of this is rounded down if necessary to be
5166   // a power of two. We want power of two interleave count to simplify any
5167   // addressing operations or alignment considerations.
5168   // We also want power of two interleave counts to ensure that the induction
5169   // variable of the vector loop wraps to zero, when tail is folded by masking;
5170   // this currently happens when OptForSize, in which case IC is set to 1 above.
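  // Illustrative example with hypothetical numbers: with 32 registers
  // available in a class, 2 loop-invariant values, and at most 6 values live
  // at once, the estimate is PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5),
  // i.e. an interleave count of 4.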
5171   unsigned IC = UINT_MAX;
5172 
5173   for (auto& pair : R.MaxLocalUsers) {
5174     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5175     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5176                       << " registers of "
5177                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5178     if (VF == 1) {
5179       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5180         TargetNumRegisters = ForceTargetNumScalarRegs;
5181     } else {
5182       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5183         TargetNumRegisters = ForceTargetNumVectorRegs;
5184     }
5185     unsigned MaxLocalUsers = pair.second;
5186     unsigned LoopInvariantRegs = 0;
5187     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5188       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5189 
5190     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5191     // Don't count the induction variable as interleaved.
5192     if (EnableIndVarRegisterHeur) {
5193       TmpIC =
5194           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5195                         std::max(1U, (MaxLocalUsers - 1)));
5196     }
5197 
5198     IC = std::min(IC, TmpIC);
5199   }
5200 
5201   // Clamp the interleave ranges to reasonable counts.
5202   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5203 
5204   // Check if the user has overridden the max.
5205   if (VF == 1) {
5206     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5207       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5208   } else {
5209     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5210       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5211   }
5212 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF.
5215   if (BestKnownTC) {
5216     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5217   }
5218 
5219   // If we did not calculate the cost for VF (because the user selected the VF)
5220   // then we calculate the cost of VF here.
5221   if (LoopCost == 0)
5222     LoopCost = expectedCost(VF).first;
5223 
5224   assert(LoopCost && "Non-zero loop cost expected");
5225 
  // Clamp the calculated IC to be between 1 and the max interleave count that
  // the target and trip count allow.
5228   if (IC > MaxInterleaveCount)
5229     IC = MaxInterleaveCount;
5230   else if (IC < 1)
5231     IC = 1;
5232 
5233   // Interleave if we vectorized this loop and there is a reduction that could
5234   // benefit from interleaving.
5235   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5236     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5237     return IC;
5238   }
5239 
5240   // Note that if we've already vectorized the loop we will have done the
5241   // runtime check and so interleaving won't require further checks.
5242   bool InterleavingRequiresRuntimePointerCheck =
5243       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5244 
5245   // We want to interleave small loops in order to reduce the loop overhead and
5246   // potentially expose ILP opportunities.
5247   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5248   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead per iteration is 1, use the cost model
    // to estimate the cost of the loop body, and interleave until the loop
    // overhead is about 5% of the total cost of the loop.
5252     unsigned SmallIC =
5253         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
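    // For example, with hypothetical values SmallLoopCost = 20 and
    // LoopCost = 4, PowerOf2Floor(20 / 4) = 4, so SmallIC = min(IC, 4).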
5254 
5255     // Interleave until store/load ports (estimated by max interleave count) are
5256     // saturated.
5257     unsigned NumStores = Legal->getNumStores();
5258     unsigned NumLoads = Legal->getNumLoads();
5259     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5260     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5261 
5262     // If we have a scalar reduction (vector reductions are already dealt with
5263     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit this, by default, to 2,
    // so the critical path only gets increased by one reduction operation.
5266     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5267       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5268       SmallIC = std::min(SmallIC, F);
5269       StoresIC = std::min(StoresIC, F);
5270       LoadsIC = std::min(LoadsIC, F);
5271     }
5272 
5273     if (EnableLoadStoreRuntimeInterleave &&
5274         std::max(StoresIC, LoadsIC) > SmallIC) {
5275       LLVM_DEBUG(
5276           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5277       return std::max(StoresIC, LoadsIC);
5278     }
5279 
5280     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5281     return SmallIC;
5282   }
5283 
5284   // Interleave if this is a large loop (small loops are already dealt with by
5285   // this point) that could benefit from interleaving.
5286   bool HasReductions = !Legal->getReductionVars()->empty();
5287   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5288     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5289     return IC;
5290   }
5291 
5292   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5293   return 1;
5294 }
5295 
5296 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5297 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5298   // This function calculates the register usage by measuring the highest number
5299   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5301   // assign a number to each instruction. We use RPO to ensure that defs are
5302   // met before their users. We assume that each instruction that has in-loop
5303   // users starts an interval. We record every time that an in-loop value is
5304   // used, so we have a list of the first and last occurrences of each
5305   // instruction. Next, we transpose this data structure into a multi map that
5306   // holds the list of intervals that *end* at a specific location. This multi
5307   // map allows us to perform a linear search. We scan the instructions linearly
5308   // and record each time that a new interval starts, by placing it in a set.
5309   // If we find this value in the multi-map then we remove it from the set.
5310   // The max register usage is the maximum size of the set.
5311   // We also search for instructions that are defined outside the loop, but are
5312   // used inside the loop. We need this number separately from the max-interval
5313   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
5315   LoopBlocksDFS DFS(TheLoop);
5316   DFS.perform(LI);
5317 
5318   RegisterUsage RU;
5319 
5320   // Each 'key' in the map opens a new interval. The values
5321   // of the map are the index of the 'last seen' usage of the
5322   // instruction that is the key.
5323   using IntervalMap = DenseMap<Instruction *, unsigned>;
5324 
5325   // Maps instruction to its index.
5326   SmallVector<Instruction *, 64> IdxToInstr;
5327   // Marks the end of each interval.
5328   IntervalMap EndPoint;
5329   // Saves the list of instruction indices that are used in the loop.
5330   SmallPtrSet<Instruction *, 8> Ends;
5331   // Saves the list of values that are used in the loop but are
5332   // defined outside the loop, such as arguments and constants.
5333   SmallPtrSet<Value *, 8> LoopInvariants;
5334 
5335   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5336     for (Instruction &I : BB->instructionsWithoutDebug()) {
5337       IdxToInstr.push_back(&I);
5338 
5339       // Save the end location of each USE.
5340       for (Value *U : I.operands()) {
5341         auto *Instr = dyn_cast<Instruction>(U);
5342 
5343         // Ignore non-instruction values such as arguments, constants, etc.
5344         if (!Instr)
5345           continue;
5346 
5347         // If this instruction is outside the loop then record it and continue.
5348         if (!TheLoop->contains(Instr)) {
5349           LoopInvariants.insert(Instr);
5350           continue;
5351         }
5352 
5353         // Overwrite previous end points.
5354         EndPoint[Instr] = IdxToInstr.size();
5355         Ends.insert(Instr);
5356       }
5357     }
5358   }
5359 
5360   // Saves the list of intervals that end with the index in 'key'.
5361   using InstrList = SmallVector<Instruction *, 2>;
5362   DenseMap<unsigned, InstrList> TransposeEnds;
5363 
5364   // Transpose the EndPoints to a list of values that end at each index.
5365   for (auto &Interval : EndPoint)
5366     TransposeEnds[Interval.second].push_back(Interval.first);
5367 
5368   SmallPtrSet<Instruction *, 8> OpenIntervals;
5369 
5370   // Get the size of the widest register.
5371   unsigned MaxSafeDepDist = -1U;
5372   if (Legal->getMaxSafeDepDistBytes() != -1U)
5373     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5374   unsigned WidestRegister =
5375       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5376   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5377 
5378   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5379   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5380 
5381   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5382 
5383   // A lambda that gets the register usage for the given type and VF.
5384   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5385     if (Ty->isTokenTy())
5386       return 0U;
5387     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5388     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5389   };
5390 
5391   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5392     Instruction *I = IdxToInstr[i];
5393 
5394     // Remove all of the instructions that end at this location.
5395     InstrList &List = TransposeEnds[i];
5396     for (Instruction *ToRemove : List)
5397       OpenIntervals.erase(ToRemove);
5398 
5399     // Ignore instructions that are never used within the loop.
5400     if (Ends.find(I) == Ends.end())
5401       continue;
5402 
5403     // Skip ignored values.
5404     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5405       continue;
5406 
5407     // For each VF find the maximum usage of registers.
5408     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5409       // Count the number of live intervals.
5410       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5411 
5412       if (VFs[j] == 1) {
5413         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          // MapVector entries default-construct to zero, so += handles both
          // new and existing register classes.
          RegUsage[ClassID] += 1;
5419         }
5420       } else {
5421         collectUniformsAndScalars(VFs[j]);
5422         for (auto Inst : OpenIntervals) {
5423           // Skip ignored values for VF > 1.
5424           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5425             continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
5439         }
5440       }
5441 
      for (auto &pair : RegUsage)
        MaxUsages[j][pair.first] =
            std::max(MaxUsages[j][pair.first], pair.second);
5448     }
5449 
5450     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5451                       << OpenIntervals.size() << '\n');
5452 
5453     // Add the current instruction to the list of open intervals.
5454     OpenIntervals.insert(I);
5455   }
5456 
5457   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5458     SmallMapVector<unsigned, unsigned, 4> Invariant;
5459 
5460     for (auto Inst : LoopInvariants) {
5461       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
      Invariant[ClassID] += Usage;
5467     }
5468 
5469     LLVM_DEBUG({
5470       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5471       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5472              << " item\n";
5473       for (const auto &pair : MaxUsages[i]) {
5474         dbgs() << "LV(REG): RegisterClass: "
5475                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5476                << " registers\n";
5477       }
5478       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5479              << " item\n";
5480       for (const auto &pair : Invariant) {
5481         dbgs() << "LV(REG): RegisterClass: "
5482                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5483                << " registers\n";
5484       }
5485     });
5486 
5487     RU.LoopInvariantRegs = Invariant;
5488     RU.MaxLocalUsers = MaxUsages[i];
5489     RUs[i] = RU;
5490   }
5491 
5492   return RUs;
5493 }
5494 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5496   // TODO: Cost model for emulated masked load/store is completely
5497   // broken. This hack guides the cost model to use an artificially
5498   // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked load/gather emulation was previously never allowed.
  // Emulation of a limited number of masked stores/scatters was allowed.
5504   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5505   return isa<LoadInst>(I) ||
5506          (isa<StoreInst>(I) &&
5507           NumPredStores > NumberOfStoresToPredicate);
5508 }
5509 
5510 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5511   // If we aren't vectorizing the loop, or if we've already collected the
5512   // instructions to scalarize, there's nothing to do. Collection may already
5513   // have occurred if we have a user-selected VF and are now computing the
5514   // expected cost for interleaving.
5515   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5516     return;
5517 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5519   // not profitable to scalarize any instructions, the presence of VF in the
5520   // map will indicate that we've analyzed it already.
5521   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5522 
5523   // Find all the instructions that are scalar with predication in the loop and
5524   // determine if it would be better to not if-convert the blocks they are in.
5525   // If so, we also record the instructions to scalarize.
5526   for (BasicBlock *BB : TheLoop->blocks()) {
5527     if (!blockNeedsPredication(BB))
5528       continue;
5529     for (Instruction &I : *BB)
5530       if (isScalarWithPredication(&I)) {
5531         ScalarCostsTy ScalarCosts;
5532         // Do not apply discount logic if hacked cost is needed
5533         // for emulated masked memrefs.
5534         if (!useEmulatedMaskMemRefHack(&I) &&
5535             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5536           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5537         // Remember that BB will remain after vectorization.
5538         PredicatedBBsAfterVectorization.insert(BB);
5539       }
5540   }
5541 }
5542 
5543 int LoopVectorizationCostModel::computePredInstDiscount(
5544     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5545     unsigned VF) {
5546   assert(!isUniformAfterVectorization(PredInst, VF) &&
5547          "Instruction marked uniform-after-vectorization will be predicated");
5548 
5549   // Initialize the discount to zero, meaning that the scalar version and the
5550   // vector version cost the same.
5551   int Discount = 0;
5552 
5553   // Holds instructions to analyze. The instructions we visit are mapped in
5554   // ScalarCosts. Those instructions are the ones that would be scalarized if
5555   // we find that the scalar version costs less.
5556   SmallVector<Instruction *, 8> Worklist;
5557 
5558   // Returns true if the given instruction can be scalarized.
5559   auto canBeScalarized = [&](Instruction *I) -> bool {
5560     // We only attempt to scalarize instructions forming a single-use chain
5561     // from the original predicated block that would otherwise be vectorized.
5562     // Although not strictly necessary, we give up on instructions we know will
5563     // already be scalar to avoid traversing chains that are unlikely to be
5564     // beneficial.
5565     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5566         isScalarAfterVectorization(I, VF))
5567       return false;
5568 
5569     // If the instruction is scalar with predication, it will be analyzed
5570     // separately. We ignore it within the context of PredInst.
5571     if (isScalarWithPredication(I))
5572       return false;
5573 
5574     // If any of the instruction's operands are uniform after vectorization,
5575     // the instruction cannot be scalarized. This prevents, for example, a
5576     // masked load from being scalarized.
5577     //
5578     // We assume we will only emit a value for lane zero of an instruction
5579     // marked uniform after vectorization, rather than VF identical values.
5580     // Thus, if we scalarize an instruction that uses a uniform, we would
5581     // create uses of values corresponding to the lanes we aren't emitting code
5582     // for. This behavior can be changed by allowing getScalarValue to clone
5583     // the lane zero values for uniforms rather than asserting.
5584     for (Use &U : I->operands())
5585       if (auto *J = dyn_cast<Instruction>(U.get()))
5586         if (isUniformAfterVectorization(J, VF))
5587           return false;
5588 
5589     // Otherwise, we can scalarize the instruction.
5590     return true;
5591   };
5592 
5593   // Compute the expected cost discount from scalarizing the entire expression
5594   // feeding the predicated instruction. We currently only consider expressions
5595   // that are single-use instruction chains.
5596   Worklist.push_back(PredInst);
5597   while (!Worklist.empty()) {
5598     Instruction *I = Worklist.pop_back_val();
5599 
5600     // If we've already analyzed the instruction, there's nothing to do.
5601     if (ScalarCosts.find(I) != ScalarCosts.end())
5602       continue;
5603 
5604     // Compute the cost of the vector instruction. Note that this cost already
5605     // includes the scalarization overhead of the predicated instruction.
5606     unsigned VectorCost = getInstructionCost(I, VF).first;
5607 
5608     // Compute the cost of the scalarized instruction. This cost is the cost of
5609     // the instruction as if it wasn't if-converted and instead remained in the
5610     // predicated block. We will scale this cost by block probability after
5611     // computing the scalarization overhead.
5612     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5613 
5614     // Compute the scalarization overhead of needed insertelement instructions
5615     // and phi nodes.
5616     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5617       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5618                                                  true, false);
5619       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5620     }
5621 
5622     // Compute the scalarization overhead of needed extractelement
5623     // instructions. For each of the instruction's operands, if the operand can
5624     // be scalarized, add it to the worklist; otherwise, account for the
5625     // overhead.
5626     for (Use &U : I->operands())
5627       if (auto *J = dyn_cast<Instruction>(U.get())) {
5628         assert(VectorType::isValidElementType(J->getType()) &&
5629                "Instruction has non-scalar type");
5630         if (canBeScalarized(J))
5631           Worklist.push_back(J);
5632         else if (needsExtract(J, VF))
5633           ScalarCost += TTI.getScalarizationOverhead(
5634                               ToVectorTy(J->getType(),VF), false, true);
5635       }
5636 
5637     // Scale the total scalar cost by block probability.
5638     ScalarCost /= getReciprocalPredBlockProb();
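    // For example, assuming a reciprocal block probability of 2 (i.e. the
    // predicated block is expected to execute about half the time), a raw
    // scalar cost of 12 is scaled down to 6.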
5639 
5640     // Compute the discount. A non-negative discount means the vector version
5641     // of the instruction costs more, and scalarizing would be beneficial.
5642     Discount += VectorCost - ScalarCost;
5643     ScalarCosts[I] = ScalarCost;
5644   }
5645 
5646   return Discount;
5647 }
5648 
5649 LoopVectorizationCostModel::VectorizationCostTy
5650 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5651   VectorizationCostTy Cost;
5652 
5653   // For each block.
5654   for (BasicBlock *BB : TheLoop->blocks()) {
5655     VectorizationCostTy BlockCost;
5656 
5657     // For each instruction in the old loop.
5658     for (Instruction &I : BB->instructionsWithoutDebug()) {
5659       // Skip ignored values.
5660       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5661           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5662         continue;
5663 
5664       VectorizationCostTy C = getInstructionCost(&I, VF);
5665 
5666       // Check if we should override the cost.
5667       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5668         C.first = ForceTargetInstructionCost;
5669 
5670       BlockCost.first += C.first;
5671       BlockCost.second |= C.second;
5672       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5673                         << " for VF " << VF << " For instruction: " << I
5674                         << '\n');
5675     }
5676 
5677     // If we are vectorizing a predicated block, it will have been
5678     // if-converted. This means that the block's instructions (aside from
5679     // stores and instructions that may divide by zero) will now be
5680     // unconditionally executed. For the scalar case, we may not always execute
5681     // the predicated block. Thus, scale the block's cost by the probability of
5682     // executing it.
5683     if (VF == 1 && blockNeedsPredication(BB))
5684       BlockCost.first /= getReciprocalPredBlockProb();
5685 
5686     Cost.first += BlockCost.first;
5687     Cost.second |= BlockCost.second;
5688   }
5689 
5690   return Cost;
5691 }
5692 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
5695 ///
5696 /// This SCEV can be sent to the Target in order to estimate the address
5697 /// calculation cost.
5698 static const SCEV *getAddressAccessSCEV(
5699               Value *Ptr,
5700               LoopVectorizationLegality *Legal,
5701               PredicatedScalarEvolution &PSE,
5702               const Loop *TheLoop) {
5703 
5704   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5705   if (!Gep)
5706     return nullptr;
5707 
5708   // We are looking for a gep with all loop invariant indices except for one
5709   // which should be an induction variable.
5710   auto SE = PSE.getSE();
5711   unsigned NumOperands = Gep->getNumOperands();
5712   for (unsigned i = 1; i < NumOperands; ++i) {
5713     Value *Opd = Gep->getOperand(i);
5714     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5715         !Legal->isInductionVariable(Opd))
5716       return nullptr;
5717   }
5718 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5720   return PSE.getSCEV(Ptr);
5721 }
5722 
5723 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5724   return Legal->hasStride(I->getOperand(0)) ||
5725          Legal->hasStride(I->getOperand(1));
5726 }
5727 
5728 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5729                                                                  unsigned VF) {
5730   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5731   Type *ValTy = getMemInstValueType(I);
5732   auto SE = PSE.getSE();
5733 
5734   unsigned AS = getLoadStoreAddressSpace(I);
5735   Value *Ptr = getLoadStorePointerOperand(I);
5736   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5737 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5740   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5741 
5742   // Get the cost of the scalar memory instruction and address computation.
5743   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5744 
5745   // Don't pass *I here, since it is scalar but will actually be part of a
5746   // vectorized loop where the user of it is a vectorized instruction.
5747   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5748   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5749                                    Alignment, AS);
5750 
5751   // Get the overhead of the extractelement and insertelement instructions
5752   // we might create due to scalarization.
5753   Cost += getScalarizationOverhead(I, VF);
5754 
5755   // If we have a predicated store, it may not be executed for each vector
5756   // lane. Scale the cost by the probability of executing the predicated
5757   // block.
5758   if (isPredicatedInst(I)) {
5759     Cost /= getReciprocalPredBlockProb();
5760 
5761     if (useEmulatedMaskMemRefHack(I))
5762       // Artificially setting to a high enough value to practically disable
5763       // vectorization with such operations.
5764       Cost = 3000000;
5765   }
5766 
5767   return Cost;
5768 }
5769 
5770 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5771                                                              unsigned VF) {
5772   Type *ValTy = getMemInstValueType(I);
5773   Type *VectorTy = ToVectorTy(ValTy, VF);
5774   Value *Ptr = getLoadStorePointerOperand(I);
5775   unsigned AS = getLoadStoreAddressSpace(I);
5776   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5777 
5778   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5779          "Stride should be 1 or -1 for consecutive memory access");
5780   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5781   unsigned Cost = 0;
5782   if (Legal->isMaskRequired(I))
5783     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5784                                       Alignment ? Alignment->value() : 0, AS);
5785   else
5786     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5787 
5788   bool Reverse = ConsecutiveStride < 0;
5789   if (Reverse)
5790     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5791   return Cost;
5792 }
5793 
5794 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5795                                                          unsigned VF) {
5796   Type *ValTy = getMemInstValueType(I);
5797   Type *VectorTy = ToVectorTy(ValTy, VF);
5798   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5799   unsigned AS = getLoadStoreAddressSpace(I);
5800   if (isa<LoadInst>(I)) {
5801     return TTI.getAddressComputationCost(ValTy) +
5802            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5803            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5804   }
5805   StoreInst *SI = cast<StoreInst>(I);
5806 
5807   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5808   return TTI.getAddressComputationCost(ValTy) +
5809          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5810          (isLoopInvariantStoreValue
5811               ? 0
5812               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5813                                        VF - 1));
5814 }
5815 
5816 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5817                                                           unsigned VF) {
5818   Type *ValTy = getMemInstValueType(I);
5819   Type *VectorTy = ToVectorTy(ValTy, VF);
5820   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5821   Value *Ptr = getLoadStorePointerOperand(I);
5822 
5823   return TTI.getAddressComputationCost(VectorTy) +
5824          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5825                                     Legal->isMaskRequired(I),
5826                                     Alignment ? Alignment->value() : 0);
5827 }
5828 
5829 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5830                                                             unsigned VF) {
5831   Type *ValTy = getMemInstValueType(I);
5832   Type *VectorTy = ToVectorTy(ValTy, VF);
5833   unsigned AS = getLoadStoreAddressSpace(I);
5834 
5835   auto Group = getInterleavedAccessGroup(I);
5836   assert(Group && "Fail to get an interleaved access group.");
5837 
5838   unsigned InterleaveFactor = Group->getFactor();
5839   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5840 
5841   // Holds the indices of existing members in an interleaved load group.
5842   // An interleaved store group doesn't need this as it doesn't allow gaps.
5843   SmallVector<unsigned, 4> Indices;
5844   if (isa<LoadInst>(I)) {
5845     for (unsigned i = 0; i < InterleaveFactor; i++)
5846       if (Group->getMember(i))
5847         Indices.push_back(i);
5848   }
5849 
5850   // Calculate the cost of the whole interleaved group.
5851   bool UseMaskForGaps =
5852       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5853   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5854       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5855       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5856 
5857   if (Group->isReverse()) {
5858     // TODO: Add support for reversed masked interleaved access.
5859     assert(!Legal->isMaskRequired(I) &&
5860            "Reverse masked interleaved access not supported.");
5861     Cost += Group->getNumMembers() *
5862             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5863   }
5864   return Cost;
5865 }
5866 
5867 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5868                                                               unsigned VF) {
  // Calculate the scalar cost only. The vectorization cost should already be
  // available at this point.
5871   if (VF == 1) {
5872     Type *ValTy = getMemInstValueType(I);
5873     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5874     unsigned AS = getLoadStoreAddressSpace(I);
5875 
5876     return TTI.getAddressComputationCost(ValTy) +
5877            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5878   }
5879   return getWideningCost(I, VF);
5880 }
5881 
5882 LoopVectorizationCostModel::VectorizationCostTy
5883 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5884   // If we know that this instruction will remain uniform, check the cost of
5885   // the scalar version.
5886   if (isUniformAfterVectorization(I, VF))
5887     VF = 1;
5888 
5889   if (VF > 1 && isProfitableToScalarize(I, VF))
5890     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5891 
5892   // Forced scalars do not have any scalarization overhead.
5893   auto ForcedScalar = ForcedScalars.find(VF);
5894   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5895     auto InstSet = ForcedScalar->second;
5896     if (InstSet.find(I) != InstSet.end())
5897       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5898   }
5899 
5900   Type *VectorTy;
5901   unsigned C = getInstructionCost(I, VF, VectorTy);
5902 
5903   bool TypeNotScalarized =
5904       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5905   return VectorizationCostTy(C, TypeNotScalarized);
5906 }
5907 
5908 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5909                                                               unsigned VF) {
5910 
5911   if (VF == 1)
5912     return 0;
5913 
5914   unsigned Cost = 0;
5915   Type *RetTy = ToVectorTy(I->getType(), VF);
5916   if (!RetTy->isVoidTy() &&
5917       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5918     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5919 
5920   // Some targets keep addresses scalar.
5921   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5922     return Cost;
5923 
5924   // Some targets support efficient element stores.
5925   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5926     return Cost;
5927 
5928   // Collect operands to consider.
5929   CallInst *CI = dyn_cast<CallInst>(I);
5930   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5931 
5932   // Skip operands that do not require extraction/scalarization and do not incur
5933   // any overhead.
5934   return Cost + TTI.getOperandsScalarizationOverhead(
5935                     filterExtractingOperands(Ops, VF), VF);
5936 }
5937 
5938 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5939   if (VF == 1)
5940     return;
5941   NumPredStores = 0;
5942   for (BasicBlock *BB : TheLoop->blocks()) {
5943     // For each instruction in the old loop.
5944     for (Instruction &I : *BB) {
5945       Value *Ptr =  getLoadStorePointerOperand(&I);
5946       if (!Ptr)
5947         continue;
5948 
5949       // TODO: We should generate better code and update the cost model for
5950       // predicated uniform stores. Today they are treated as any other
5951       // predicated store (see added test cases in
5952       // invariant-store-vectorization.ll).
5953       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5954         NumPredStores++;
5955 
5956       if (Legal->isUniform(Ptr) &&
5957           // Conditional loads and stores should be scalarized and predicated.
5958           // isScalarWithPredication cannot be used here since masked
5959           // gather/scatters are not considered scalar with predication.
5960           !Legal->blockNeedsPredication(I.getParent())) {
5961         // TODO: Avoid replicating loads and stores instead of
5962         // relying on instcombine to remove them.
5963         // Load: Scalar load + broadcast
5964         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5965         unsigned Cost = getUniformMemOpCost(&I, VF);
5966         setWideningDecision(&I, VF, CM_Scalarize, Cost);
5967         continue;
5968       }
5969 
5970       // We assume that widening is the best solution when possible.
5971       if (memoryInstructionCanBeWidened(&I, VF)) {
5972         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5973         int ConsecutiveStride =
5974                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5975         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5976                "Expected consecutive stride.");
5977         InstWidening Decision =
5978             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5979         setWideningDecision(&I, VF, Decision, Cost);
5980         continue;
5981       }
5982 
5983       // Choose between Interleaving, Gather/Scatter or Scalarization.
5984       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5985       unsigned NumAccesses = 1;
5986       if (isAccessInterleaved(&I)) {
5987         auto Group = getInterleavedAccessGroup(&I);
5988         assert(Group && "Fail to get an interleaved access group.");
5989 
5990         // Make one decision for the whole group.
5991         if (getWideningDecision(&I, VF) != CM_Unknown)
5992           continue;
5993 
5994         NumAccesses = Group->getNumMembers();
5995         if (interleavedAccessCanBeWidened(&I, VF))
5996           InterleaveCost = getInterleaveGroupCost(&I, VF);
5997       }
5998 
5999       unsigned GatherScatterCost =
6000           isLegalGatherOrScatter(&I)
6001               ? getGatherScatterCost(&I, VF) * NumAccesses
6002               : std::numeric_limits<unsigned>::max();
6003 
6004       unsigned ScalarizationCost =
6005           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6006 
6007       // Choose better solution for the current VF,
6008       // write down this decision and use it during vectorization.
6009       unsigned Cost;
6010       InstWidening Decision;
6011       if (InterleaveCost <= GatherScatterCost &&
6012           InterleaveCost < ScalarizationCost) {
6013         Decision = CM_Interleave;
6014         Cost = InterleaveCost;
6015       } else if (GatherScatterCost < ScalarizationCost) {
6016         Decision = CM_GatherScatter;
6017         Cost = GatherScatterCost;
6018       } else {
6019         Decision = CM_Scalarize;
6020         Cost = ScalarizationCost;
6021       }
      // If the instruction belongs to an interleave group, the whole group
6023       // receives the same decision. The whole group receives the cost, but
6024       // the cost will actually be assigned to one instruction.
6025       if (auto Group = getInterleavedAccessGroup(&I))
6026         setWideningDecision(Group, VF, Decision, Cost);
6027       else
6028         setWideningDecision(&I, VF, Decision, Cost);
6029     }
6030   }
6031 
  // Make sure that any load of an address and any other address computation
6033   // remains scalar unless there is gather/scatter support. This avoids
6034   // inevitable extracts into address registers, and also has the benefit of
6035   // activating LSR more, since that pass can't optimize vectorized
6036   // addresses.
6037   if (TTI.prefersVectorizedAddressing())
6038     return;
6039 
6040   // Start with all scalar pointer uses.
6041   SmallPtrSet<Instruction *, 8> AddrDefs;
6042   for (BasicBlock *BB : TheLoop->blocks())
6043     for (Instruction &I : *BB) {
6044       Instruction *PtrDef =
6045         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6046       if (PtrDef && TheLoop->contains(PtrDef) &&
6047           getWideningDecision(&I, VF) != CM_GatherScatter)
6048         AddrDefs.insert(PtrDef);
6049     }
6050 
6051   // Add all instructions used to generate the addresses.
6052   SmallVector<Instruction *, 4> Worklist;
6053   for (auto *I : AddrDefs)
6054     Worklist.push_back(I);
6055   while (!Worklist.empty()) {
6056     Instruction *I = Worklist.pop_back_val();
6057     for (auto &Op : I->operands())
6058       if (auto *InstOp = dyn_cast<Instruction>(Op))
6059         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6060             AddrDefs.insert(InstOp).second)
6061           Worklist.push_back(InstOp);
6062   }
6063 
6064   for (auto *I : AddrDefs) {
6065     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
6068       // if the loaded register is involved in an address computation, it is
6069       // instead changed here when we know this is the case.
6070       InstWidening Decision = getWideningDecision(I, VF);
6071       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6072         // Scalarize a widened load of address.
6073         setWideningDecision(I, VF, CM_Scalarize,
6074                             (VF * getMemoryInstructionCost(I, 1)));
6075       else if (auto Group = getInterleavedAccessGroup(I)) {
6076         // Scalarize an interleave group of address loads.
6077         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6078           if (Instruction *Member = Group->getMember(I))
6079             setWideningDecision(Member, VF, CM_Scalarize,
6080                                 (VF * getMemoryInstructionCost(Member, 1)));
6081         }
6082       }
6083     } else
6084       // Make sure I gets scalarized and a cost estimate without
6085       // scalarization overhead.
6086       ForcedScalars[VF].insert(I);
6087   }
6088 }
6089 
6090 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6091                                                         unsigned VF,
6092                                                         Type *&VectorTy) {
6093   Type *RetTy = I->getType();
6094   if (canTruncateToMinimalBitwidth(I, VF))
6095     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6096   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6097   auto SE = PSE.getSE();
6098 
6099   // TODO: We need to estimate the cost of intrinsic calls.
6100   switch (I->getOpcode()) {
6101   case Instruction::GetElementPtr:
6102     // We mark this instruction as zero-cost because the cost of GEPs in
6103     // vectorized code depends on whether the corresponding memory instruction
6104     // is scalarized or not. Therefore, we handle GEPs with the memory
6105     // instruction cost.
6106     return 0;
6107   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
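    // For example (illustrative IR, VF = 4), each guarded scalar block is
    // entered via something like:
    //   %c = extractelement <4 x i1> %mask, i32 0
    //   br i1 %c, label %pred.store.if, label %pred.store.continue
    // hence the scalarization overhead (extracts) plus VF branch costs below.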
6111     bool ScalarPredicatedBB = false;
6112     BranchInst *BI = cast<BranchInst>(I);
6113     if (VF > 1 && BI->isConditional() &&
6114         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6115              PredicatedBBsAfterVectorization.end() ||
6116          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6117              PredicatedBBsAfterVectorization.end()))
6118       ScalarPredicatedBB = true;
6119 
6120     if (ScalarPredicatedBB) {
6121       // Return cost for branches around scalarized and predicated blocks.
6122       Type *Vec_i1Ty =
6123           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6124       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6125               (TTI.getCFInstrCost(Instruction::Br) * VF));
6126     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6127       // The back-edge branch will remain, as will all scalar branches.
6128       return TTI.getCFInstrCost(Instruction::Br);
6129     else
6130       // This branch will be eliminated by if-conversion.
6131       return 0;
6132     // Note: We currently assume zero cost for an unconditional branch inside
6133     // a predicated block since it will become a fall-through, although we
6134     // may decide in the future to call TTI for all branches.
6135   }
6136   case Instruction::PHI: {
6137     auto *Phi = cast<PHINode>(I);
6138 
6139     // First-order recurrences are replaced by vector shuffles inside the loop.
6140     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6141     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6142       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6143                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6144 
6145     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6146     // converted into select instructions. We require N - 1 selects per phi
6147     // node, where N is the number of incoming values.
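    // For example (illustrative IR, VF = 4), a phi with three incoming values
    // %a, %b and %c becomes two nested selects:
    //   %s1 = select <4 x i1> %mask.b, <4 x i32> %b, <4 x i32> %a
    //   %s2 = select <4 x i1> %mask.c, <4 x i32> %c, <4 x i32> %s1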
6148     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6149       return (Phi->getNumIncomingValues() - 1) *
6150              TTI.getCmpSelInstrCost(
6151                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6152                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6153 
6154     return TTI.getCFInstrCost(Instruction::PHI);
6155   }
6156   case Instruction::UDiv:
6157   case Instruction::SDiv:
6158   case Instruction::URem:
6159   case Instruction::SRem:
6160     // If we have a predicated instruction, it may not be executed for each
6161     // vector lane. Get the scalarization cost and scale this amount by the
6162     // probability of executing the predicated block. If the instruction is not
6163     // predicated, we fall through to the next case.
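    // Sketch of the estimate below (assuming the default reciprocal block
    // probability of 2, i.e. each predicated block is assumed to execute on
    // about half of the iterations):
    //   Cost ~= (VF * (phi + div) + insert/extract overhead) / 2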
6164     if (VF > 1 && isScalarWithPredication(I)) {
6165       unsigned Cost = 0;
6166 
6167       // These instructions have a non-void type, so account for the phi nodes
6168       // that we will create. This cost is likely to be zero. The phi node
6169       // cost, if any, should be scaled by the block probability because it
6170       // models a copy at the end of each predicated block.
6171       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6172 
6173       // The cost of the non-predicated instruction.
6174       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6175 
6176       // The cost of insertelement and extractelement instructions needed for
6177       // scalarization.
6178       Cost += getScalarizationOverhead(I, VF);
6179 
6180       // Scale the cost by the probability of executing the predicated blocks.
6181       // This assumes the predicated block for each vector lane is equally
6182       // likely.
6183       return Cost / getReciprocalPredBlockProb();
6184     }
6185     LLVM_FALLTHROUGH;
6186   case Instruction::Add:
6187   case Instruction::FAdd:
6188   case Instruction::Sub:
6189   case Instruction::FSub:
6190   case Instruction::Mul:
6191   case Instruction::FMul:
6192   case Instruction::FDiv:
6193   case Instruction::FRem:
6194   case Instruction::Shl:
6195   case Instruction::LShr:
6196   case Instruction::AShr:
6197   case Instruction::And:
6198   case Instruction::Or:
6199   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
6201     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6202       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
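    // For example (illustrative IR), a shift by a uniform constant such as
    //   %s = shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
    // can map to an immediate-form shift, whereas a variable per-lane shift
    // amount is often more expensive; the operand info passed to TTI below
    // lets the target make that distinction.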
6205     Value *Op2 = I->getOperand(1);
6206     TargetTransformInfo::OperandValueProperties Op2VP;
6207     TargetTransformInfo::OperandValueKind Op2VK =
6208         TTI.getOperandInfo(Op2, Op2VP);
6209     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6210       Op2VK = TargetTransformInfo::OK_UniformValue;
6211 
6212     SmallVector<const Value *, 4> Operands(I->operand_values());
6213     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6214     return N * TTI.getArithmeticInstrCost(
6215                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6216                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
6217   }
6218   case Instruction::FNeg: {
6219     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6220     return N * TTI.getArithmeticInstrCost(
6221                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6222                    TargetTransformInfo::OK_AnyValue,
6223                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6224                    I->getOperand(0));
6225   }
6226   case Instruction::Select: {
6227     SelectInst *SI = cast<SelectInst>(I);
6228     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6229     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6230     Type *CondTy = SI->getCondition()->getType();
6231     if (!ScalarCond)
6232       CondTy = VectorType::get(CondTy, VF);
6233 
6234     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6235   }
6236   case Instruction::ICmp:
6237   case Instruction::FCmp: {
6238     Type *ValTy = I->getOperand(0)->getType();
6239     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6240     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6241       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6242     VectorTy = ToVectorTy(ValTy, VF);
6243     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6244   }
6245   case Instruction::Store:
6246   case Instruction::Load: {
6247     unsigned Width = VF;
6248     if (Width > 1) {
6249       InstWidening Decision = getWideningDecision(I, Width);
6250       assert(Decision != CM_Unknown &&
6251              "CM decision should be taken at this point");
6252       if (Decision == CM_Scalarize)
6253         Width = 1;
6254     }
6255     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6256     return getMemoryInstructionCost(I, VF);
6257   }
6258   case Instruction::ZExt:
6259   case Instruction::SExt:
6260   case Instruction::FPToUI:
6261   case Instruction::FPToSI:
6262   case Instruction::FPExt:
6263   case Instruction::PtrToInt:
6264   case Instruction::IntToPtr:
6265   case Instruction::SIToFP:
6266   case Instruction::UIToFP:
6267   case Instruction::Trunc:
6268   case Instruction::FPTrunc:
6269   case Instruction::BitCast: {
6270     // We optimize the truncation of induction variables having constant
6271     // integer steps. The cost of these truncations is the same as the scalar
6272     // operation.
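    // For example (illustrative IR), "%t = trunc i64 %iv to i32", where %iv is
    // an induction with a constant step, can be generated directly as a
    // truncated i32 induction, so only the scalar trunc cost is charged here.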
6273     if (isOptimizableIVTruncate(I, VF)) {
6274       auto *Trunc = cast<TruncInst>(I);
6275       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6276                                   Trunc->getSrcTy(), Trunc);
6277     }
6278 
6279     Type *SrcScalarTy = I->getOperand(0)->getType();
6280     Type *SrcVecTy =
6281         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6282     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6286       //
6287       // Calculate the modified src and dest types.
6288       Type *MinVecTy = VectorTy;
6289       if (I->getOpcode() == Instruction::Trunc) {
6290         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6291         VectorTy =
6292             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6293       } else if (I->getOpcode() == Instruction::ZExt ||
6294                  I->getOpcode() == Instruction::SExt) {
6295         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6296         VectorTy =
6297             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6298       }
6299     }
6300 
6301     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6302     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6303   }
6304   case Instruction::Call: {
6305     bool NeedToScalarize;
6306     CallInst *CI = cast<CallInst>(I);
6307     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6308     if (getVectorIntrinsicIDForCall(CI, TLI))
6309       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6310     return CallCost;
6311   }
6312   default:
    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown; assume its cost is the same as 'mul'.
6315     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6316            getScalarizationOverhead(I, VF);
6317   } // end of switch.
6318 }
6319 
6320 char LoopVectorize::ID = 0;
6321 
6322 static const char lv_name[] = "Loop Vectorization";
6323 
6324 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6325 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6326 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6327 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6328 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6329 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6330 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6331 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6332 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6333 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6334 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6335 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6336 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6337 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6338 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6339 
6340 namespace llvm {
6341 
6342 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6343 
6344 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6345                               bool VectorizeOnlyWhenForced) {
6346   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6347 }
6348 
6349 } // end namespace llvm
6350 
6351 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6352   // Check if the pointer operand of a load or store instruction is
6353   // consecutive.
6354   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6355     return Legal->isConsecutivePtr(Ptr);
6356   return false;
6357 }
6358 
6359 void LoopVectorizationCostModel::collectValuesToIgnore() {
6360   // Ignore ephemeral values.
6361   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6362 
6363   // Ignore type-promoting instructions we identified during reduction
6364   // detection.
6365   for (auto &Reduction : *Legal->getReductionVars()) {
6366     RecurrenceDescriptor &RedDes = Reduction.second;
6367     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6368     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6369   }
6370   // Ignore type-casting instructions we identified during induction
6371   // detection.
6372   for (auto &Induction : *Legal->getInductionVars()) {
6373     InductionDescriptor &IndDes = Induction.second;
6374     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6375     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6376   }
6377 }
6378 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it yet because VPlan currently
// doesn't have a cost model that can choose which plan to execute when
// more than one is generated.
6384 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6385                                  LoopVectorizationCostModel &CM) {
6386   unsigned WidestType;
6387   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6388   return WidestVectorRegBits / WidestType;
6389 }
6390 
6391 VectorizationFactor
6392 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6393   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
6398   if (!OrigLoop->empty()) {
6399     // If the user doesn't provide a vectorization factor, determine a
6400     // reasonable one.
6401     if (!UserVF) {
6402       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6403       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6404 
6405       // Make sure we have a VF > 1 for stress testing.
6406       if (VPlanBuildStressTest && VF < 2) {
6407         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6408                           << "overriding computed VF.\n");
6409         VF = 4;
6410       }
6411     }
6412     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6413     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6414     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6415                       << " to build VPlans.\n");
6416     buildVPlans(VF, VF);
6417 
6418     // For VPlan build stress testing, we bail out after VPlan construction.
6419     if (VPlanBuildStressTest)
6420       return VectorizationFactor::Disabled();
6421 
6422     return {VF, 0};
6423   }
6424 
6425   LLVM_DEBUG(
6426       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6427                 "VPlan-native path.\n");
6428   return VectorizationFactor::Disabled();
6429 }
6430 
6431 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6432   assert(OrigLoop->empty() && "Inner loop expected.");
6433   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6435     return None;
6436 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
6438   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6439       !useMaskedInterleavedAccesses(*TTI)) {
6440     LLVM_DEBUG(
6441         dbgs()
6442         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6443            "which requires masked-interleaved support.\n");
6444     CM.InterleaveInfo.reset();
6445   }
6446 
6447   if (UserVF) {
6448     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6449     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6450     // Collect the instructions (and their associated costs) that will be more
6451     // profitable to scalarize.
6452     CM.selectUserVectorizationFactor(UserVF);
6453     buildVPlansWithVPRecipes(UserVF, UserVF);
6454     LLVM_DEBUG(printPlans(dbgs()));
6455     return {{UserVF, 0}};
6456   }
6457 
6458   unsigned MaxVF = MaybeMaxVF.getValue();
6459   assert(MaxVF != 0 && "MaxVF is zero.");
6460 
6461   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6462     // Collect Uniform and Scalar instructions after vectorization with VF.
6463     CM.collectUniformsAndScalars(VF);
6464 
6465     // Collect the instructions (and their associated costs) that will be more
6466     // profitable to scalarize.
6467     if (VF > 1)
6468       CM.collectInstsToScalarize(VF);
6469   }
6470 
6471   buildVPlansWithVPRecipes(1, MaxVF);
6472   LLVM_DEBUG(printPlans(dbgs()));
6473   if (MaxVF == 1)
6474     return VectorizationFactor::Disabled();
6475 
6476   // Select the optimal vectorization factor.
6477   return CM.selectVectorizationFactor(MaxVF);
6478 }
6479 
6480 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6481   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6482                     << '\n');
6483   BestVF = VF;
6484   BestUF = UF;
6485 
6486   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6487     return !Plan->hasVF(VF);
6488   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6490 }
6491 
6492 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6493                                            DominatorTree *DT) {
6494   // Perform the actual loop transformation.
6495 
6496   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6497   VPCallbackILV CallbackILV(ILV);
6498 
6499   VPTransformState State{BestVF, BestUF,      LI,
6500                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6501                          &ILV,   CallbackILV};
6502   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6503   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6504 
6505   //===------------------------------------------------===//
6506   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
6510   //
6511   //===------------------------------------------------===//
6512 
6513   // 2. Copy and widen instructions from the old loop into the new loop.
6514   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6515   VPlans.front()->execute(&State);
6516 
6517   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6518   //    predication, updating analyses.
6519   ILV.fixVectorizedLoop();
6520 }
6521 
6522 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6523     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6524   BasicBlock *Latch = OrigLoop->getLoopLatch();
6525 
6526   // We create new control-flow for the vectorized loop, so the original
6527   // condition will be dead after vectorization if it's only used by the
6528   // branch.
6529   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6530   if (Cmp && Cmp->hasOneUse())
6531     DeadInstructions.insert(Cmp);
6532 
6533   // We create new "steps" for induction variable updates to which the original
6534   // induction variables map. An original update instruction will be dead if
6535   // all its users except the induction variable are dead.
6536   for (auto &Induction : *Legal->getInductionVars()) {
6537     PHINode *Ind = Induction.first;
6538     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6539     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6540           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6541                                  DeadInstructions.end();
6542         }))
6543       DeadInstructions.insert(IndUpdate);
6544 
    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
6553     InductionDescriptor &IndDes = Induction.second;
6554     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6555     DeadInstructions.insert(Casts.begin(), Casts.end());
6556   }
6557 }
6558 
6559 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6560 
6561 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6562 
6563 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6564                                         Instruction::BinaryOps BinOp) {
6565   // When unrolling and the VF is 1, we only need to add a simple scalar.
6566   Type *Ty = Val->getType();
6567   assert(!Ty->isVectorTy() && "Val must be a scalar");
6568 
6569   if (Ty->isFloatingPointTy()) {
6570     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6571 
6572     // Floating point operations had to be 'fast' to enable the unrolling.
6573     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6574     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6575   }
6576   Constant *C = ConstantInt::get(Ty, StartIdx);
6577   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6578 }
6579 
6580 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6581   SmallVector<Metadata *, 4> MDs;
6582   // Reserve first location for self reference to the LoopID metadata node.
6583   MDs.push_back(nullptr);
6584   bool IsUnrollMetadata = false;
6585   MDNode *LoopID = L->getLoopID();
6586   if (LoopID) {
6587     // First find existing loop unrolling disable metadata.
6588     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6589       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6590       if (MD) {
6591         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6592         IsUnrollMetadata =
6593             S && S->getString().startswith("llvm.loop.unroll.disable");
6594       }
6595       MDs.push_back(LoopID->getOperand(i));
6596     }
6597   }
6598 
6599   if (!IsUnrollMetadata) {
6600     // Add runtime unroll disable metadata.
6601     LLVMContext &Context = L->getHeader()->getContext();
6602     SmallVector<Metadata *, 1> DisableOperands;
6603     DisableOperands.push_back(
6604         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6605     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6606     MDs.push_back(DisableNode);
6607     MDNode *NewLoopID = MDNode::get(Context, MDs);
6608     // Set operand 0 to refer to the loop id itself.
6609     NewLoopID->replaceOperandWith(0, NewLoopID);
6610     L->setLoopID(NewLoopID);
6611   }
6612 }
6613 
6614 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6615     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6616   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6617   bool PredicateAtRangeStart = Predicate(Range.Start);
6618 
6619   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6620     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6621       Range.End = TmpVF;
6622       break;
6623     }
6624 
6625   return PredicateAtRangeStart;
6626 }
6627 
/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VFs starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
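/// For example (illustrative), with MinVF = 1 and MaxVF = 8 the loop below
/// might end up building one VPlan for {1}, one for {2, 4} and one for {8},
/// depending on where the per-VF decisions taken via
/// getDecisionAndClampRange() change across the range.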
6633 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6634   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6635     VFRange SubRange = {VF, MaxVF + 1};
6636     VPlans.push_back(buildVPlan(SubRange));
6637     VF = SubRange.End;
6638   }
6639 }
6640 
6641 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6642                                          VPlanPtr &Plan) {
6643   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6644 
6645   // Look for cached value.
6646   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6647   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6648   if (ECEntryIt != EdgeMaskCache.end())
6649     return ECEntryIt->second;
6650 
6651   VPValue *SrcMask = createBlockInMask(Src, Plan);
6652 
6653   // The terminator has to be a branch inst!
6654   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6655   assert(BI && "Unexpected terminator found");
6656 
6657   if (!BI->isConditional())
6658     return EdgeMaskCache[Edge] = SrcMask;
6659 
6660   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6661   assert(EdgeMask && "No Edge Mask found for condition");
6662 
6663   if (BI->getSuccessor(0) != Dst)
6664     EdgeMask = Builder.createNot(EdgeMask);
6665 
6666   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6667     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6668 
6669   return EdgeMaskCache[Edge] = EdgeMask;
6670 }
6671 
6672 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6673   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6674 
6675   // Look for cached value.
6676   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6677   if (BCEntryIt != BlockMaskCache.end())
6678     return BCEntryIt->second;
6679 
6680   // All-one mask is modelled as no-mask following the convention for masked
6681   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6682   VPValue *BlockMask = nullptr;
6683 
6684   if (OrigLoop->getHeader() == BB) {
6685     if (!CM.blockNeedsPredication(BB))
6686       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6687 
6688     // Introduce the early-exit compare IV <= BTC to form header block mask.
6689     // This is used instead of IV < TC because TC may wrap, unlike BTC.
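    // For example (illustrative), with an i8 induction and a trip count of
    // 256, TC wraps to 0 while BTC == 255 is still representable, so the
    // comparison "%iv ule BTC" keeps exactly the in-bounds lanes active.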
6690     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6691     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6692     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6693     return BlockMaskCache[BB] = BlockMask;
6694   }
6695 
6696   // This is the block mask. We OR all incoming edges.
6697   for (auto *Predecessor : predecessors(BB)) {
6698     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6699     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6700       return BlockMaskCache[BB] = EdgeMask;
6701 
6702     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6703       BlockMask = EdgeMask;
6704       continue;
6705     }
6706 
6707     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6708   }
6709 
6710   return BlockMaskCache[BB] = BlockMask;
6711 }
6712 
6713 VPWidenMemoryInstructionRecipe *
6714 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6715                                   VPlanPtr &Plan) {
6716   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6717     return nullptr;
6718 
6719   auto willWiden = [&](unsigned VF) -> bool {
6720     if (VF == 1)
6721       return false;
6722     LoopVectorizationCostModel::InstWidening Decision =
6723         CM.getWideningDecision(I, VF);
6724     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6725            "CM decision should be taken at this point.");
6726     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6727       return true;
6728     if (CM.isScalarAfterVectorization(I, VF) ||
6729         CM.isProfitableToScalarize(I, VF))
6730       return false;
6731     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6732   };
6733 
6734   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6735     return nullptr;
6736 
6737   VPValue *Mask = nullptr;
6738   if (Legal->isMaskRequired(I))
6739     Mask = createBlockInMask(I->getParent(), Plan);
6740 
6741   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6742 }
6743 
6744 VPWidenIntOrFpInductionRecipe *
6745 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6746   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6747     // Check if this is an integer or fp induction. If so, build the recipe that
6748     // produces its scalar and vector values.
6749     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6750     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6751         II.getKind() == InductionDescriptor::IK_FpInduction)
6752       return new VPWidenIntOrFpInductionRecipe(Phi);
6753 
6754     return nullptr;
6755   }
6756 
6757   // Optimize the special case where the source is a constant integer
6758   // induction variable. Notice that we can only optimize the 'trunc' case
6759   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6760   // (c) other casts depend on pointer size.
6761 
6762   // Determine whether \p K is a truncation based on an induction variable that
6763   // can be optimized.
6764   auto isOptimizableIVTruncate =
6765       [&](Instruction *K) -> std::function<bool(unsigned)> {
6766     return
6767         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6768   };
6769 
6770   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6771                                isOptimizableIVTruncate(I), Range))
6772     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6773                                              cast<TruncInst>(I));
6774   return nullptr;
6775 }
6776 
6777 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6778   PHINode *Phi = dyn_cast<PHINode>(I);
6779   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6780     return nullptr;
6781 
6782   // We know that all PHIs in non-header blocks are converted into selects, so
6783   // we don't have to worry about the insertion order and we can just use the
6784   // builder. At this point we generate the predication tree. There may be
6785   // duplications since this is a simple recursive scan, but future
6786   // optimizations will clean it up.
6787 
6788   SmallVector<VPValue *, 2> Masks;
6789   unsigned NumIncoming = Phi->getNumIncomingValues();
6790   for (unsigned In = 0; In < NumIncoming; In++) {
6791     VPValue *EdgeMask =
6792       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6793     assert((EdgeMask || NumIncoming == 1) &&
6794            "Multiple predecessors with one having a full mask");
6795     if (EdgeMask)
6796       Masks.push_back(EdgeMask);
6797   }
6798   return new VPBlendRecipe(Phi, Masks);
6799 }
6800 
6801 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6802                                  VFRange &Range) {
6803 
6804   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6805       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6806 
6807   if (IsPredicated)
6808     return false;
6809 
6810   auto IsVectorizableOpcode = [](unsigned Opcode) {
6811     switch (Opcode) {
6812     case Instruction::Add:
6813     case Instruction::And:
6814     case Instruction::AShr:
6815     case Instruction::BitCast:
6816     case Instruction::Br:
6817     case Instruction::Call:
6818     case Instruction::FAdd:
6819     case Instruction::FCmp:
6820     case Instruction::FDiv:
6821     case Instruction::FMul:
6822     case Instruction::FNeg:
6823     case Instruction::FPExt:
6824     case Instruction::FPToSI:
6825     case Instruction::FPToUI:
6826     case Instruction::FPTrunc:
6827     case Instruction::FRem:
6828     case Instruction::FSub:
6829     case Instruction::GetElementPtr:
6830     case Instruction::ICmp:
6831     case Instruction::IntToPtr:
6832     case Instruction::Load:
6833     case Instruction::LShr:
6834     case Instruction::Mul:
6835     case Instruction::Or:
6836     case Instruction::PHI:
6837     case Instruction::PtrToInt:
6838     case Instruction::SDiv:
6839     case Instruction::Select:
6840     case Instruction::SExt:
6841     case Instruction::Shl:
6842     case Instruction::SIToFP:
6843     case Instruction::SRem:
6844     case Instruction::Store:
6845     case Instruction::Sub:
6846     case Instruction::Trunc:
6847     case Instruction::UDiv:
6848     case Instruction::UIToFP:
6849     case Instruction::URem:
6850     case Instruction::Xor:
6851     case Instruction::ZExt:
6852       return true;
6853     }
6854     return false;
6855   };
6856 
6857   if (!IsVectorizableOpcode(I->getOpcode()))
6858     return false;
6859 
6860   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6861     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6862     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6863                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6864       return false;
6865   }
6866 
6867   auto willWiden = [&](unsigned VF) -> bool {
6868     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6869                              CM.isProfitableToScalarize(I, VF)))
6870       return false;
6871     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6872       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag shows whether we use an intrinsic or a plain call for the
      // vectorized version of the instruction: is it beneficial to perform the
      // intrinsic call compared to the library call?
6877       bool NeedToScalarize;
6878       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6879       bool UseVectorIntrinsic =
6880           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6881       return UseVectorIntrinsic || !NeedToScalarize;
6882     }
6883     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6884       assert(CM.getWideningDecision(I, VF) ==
6885                  LoopVectorizationCostModel::CM_Scalarize &&
6886              "Memory widening decisions should have been taken care by now");
6887       return false;
6888     }
6889     return true;
6890   };
6891 
6892   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6893     return false;
6894 
6895   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6896   // to avoid having to split recipes later.
6897   bool IsSingleton = Ingredient2Recipe.count(I);
6898 
6899   // Success: widen this instruction. We optimize the common case where
6900   // consecutive instructions can be represented by a single recipe.
6901   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6902       LastExtensibleRecipe->appendInstruction(I))
6903     return true;
6904 
6905   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6906   if (!IsSingleton)
6907     LastExtensibleRecipe = WidenRecipe;
6908   setRecipe(I, WidenRecipe);
6909   VPBB->appendRecipe(WidenRecipe);
6910   return true;
6911 }
6912 
6913 VPBasicBlock *VPRecipeBuilder::handleReplication(
6914     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6915     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6916     VPlanPtr &Plan) {
6917   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6918       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6919       Range);
6920 
6921   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6922       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6923 
6924   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6925   setRecipe(I, Recipe);
6926 
6927   // Find if I uses a predicated instruction. If so, it will use its scalar
6928   // value. Avoid hoisting the insert-element which packs the scalar value into
6929   // a vector value, as that happens iff all users use the vector value.
6930   for (auto &Op : I->operands())
6931     if (auto *PredInst = dyn_cast<Instruction>(Op))
6932       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6933         PredInst2Recipe[PredInst]->setAlsoPack(false);
6934 
6935   // Finalize the recipe for Instr, first if it is not predicated.
6936   if (!IsPredicated) {
6937     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6938     VPBB->appendRecipe(Recipe);
6939     return VPBB;
6940   }
6941   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6942   assert(VPBB->getSuccessors().empty() &&
6943          "VPBB has successors when handling predicated replication.");
6944   // Record predicated instructions for above packing optimizations.
6945   PredInst2Recipe[I] = Recipe;
6946   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6947   VPBlockUtils::insertBlockAfter(Region, VPBB);
6948   auto *RegSucc = new VPBasicBlock();
6949   VPBlockUtils::insertBlockAfter(RegSucc, Region);
6950   return RegSucc;
6951 }
6952 
6953 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6954                                                       VPRecipeBase *PredRecipe,
6955                                                       VPlanPtr &Plan) {
6956   // Instructions marked for predication are replicated and placed under an
6957   // if-then construct to prevent side-effects.
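  // The resulting region has the (illustrative) triangular shape:
  //
  //   pred.<opcode>.entry
  //     |            \
  //     |  (mask set) pred.<opcode>.if
  //     |            /
  //   pred.<opcode>.continue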
6958 
6959   // Generate recipes to compute the block mask for this region.
6960   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6961 
6962   // Build the triangular if-then region.
6963   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6964   assert(Instr->getParent() && "Predicated instruction not in any basic block");
6965   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6966   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6967   auto *PHIRecipe =
6968       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6969   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6970   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6971   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6972 
6973   // Note: first set Entry as region entry and then connect successors starting
6974   // from it in order, to propagate the "parent" of each VPBasicBlock.
6975   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6976   VPBlockUtils::connectBlocks(Pred, Exit);
6977 
6978   return Region;
6979 }
6980 
6981 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6982                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
6983   VPRecipeBase *Recipe = nullptr;
6984 
6985   // First, check for specific widening recipes that deal with memory
6986   // operations, inductions and Phi nodes.
6987   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
6988       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
6989       (Recipe = tryToBlend(Instr, Plan)) ||
6990       (isa<PHINode>(Instr) &&
6991        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
6992     setRecipe(Instr, Recipe);
6993     VPBB->appendRecipe(Recipe);
6994     return true;
6995   }
6996 
6997   // Check if Instr is to be widened by a general VPWidenRecipe.
6998   if (tryToWiden(Instr, VPBB, Range))
6999     return true;
7000 
7001   return false;
7002 }
7003 
7004 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7005                                                         unsigned MaxVF) {
7006   assert(OrigLoop->empty() && "Inner loop expected.");
7007 
7008   // Collect conditions feeding internal conditional branches; they need to be
7009   // represented in VPlan for it to model masking.
7010   SmallPtrSet<Value *, 1> NeedDef;
7011 
7012   auto *Latch = OrigLoop->getLoopLatch();
7013   for (BasicBlock *BB : OrigLoop->blocks()) {
7014     if (BB == Latch)
7015       continue;
7016     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7017     if (Branch && Branch->isConditional())
7018       NeedDef.insert(Branch->getCondition());
7019   }
7020 
7021   // If the tail is to be folded by masking, the primary induction variable
7022   // needs to be represented in VPlan for it to model early-exit masking.
7023   // Also, both the Phi and the live-out instruction of each reduction are
7024   // required in order to introduce a select between them in VPlan.
7025   if (CM.foldTailByMasking()) {
7026     NeedDef.insert(Legal->getPrimaryInduction());
7027     for (auto &Reduction : *Legal->getReductionVars()) {
7028       NeedDef.insert(Reduction.first);
7029       NeedDef.insert(Reduction.second.getLoopExitInstr());
7030     }
7031   }
7032 
7033   // Collect instructions from the original loop that will become trivially dead
7034   // in the vectorized loop. We don't need to vectorize these instructions. For
7035   // example, original induction update instructions can become dead because we
7036   // separately emit induction "steps" when generating code for the new loop.
7037   // Similarly, we create a new latch condition when setting up the structure
7038   // of the new loop, so the old one can become dead.
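  // For example (illustrative IR), in a loop whose latch ends with
  //   %iv.next = add i64 %iv, 1
  //   %exit = icmp eq i64 %iv.next, %n
  //   br i1 %exit, ...
  // both the increment and the compare typically become dead, since the vector
  // loop emits its own induction steps and latch condition.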
7039   SmallPtrSet<Instruction *, 4> DeadInstructions;
7040   collectTriviallyDeadInstructions(DeadInstructions);
7041 
7042   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7043     VFRange SubRange = {VF, MaxVF + 1};
7044     VPlans.push_back(
7045         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
7046     VF = SubRange.End;
7047   }
7048 }
7049 
7050 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7051     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7052     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7053 
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7057   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7058 
7059   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7060 
7061   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7062 
7063   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7064 
7065   // ---------------------------------------------------------------------------
7066   // Pre-construction: record ingredients whose recipes we'll need to further
7067   // process after constructing the initial VPlan.
7068   // ---------------------------------------------------------------------------
7069 
7070   // Mark instructions we'll need to sink later and their targets as
7071   // ingredients whose recipe we'll need to record.
7072   for (auto &Entry : SinkAfter) {
7073     RecipeBuilder.recordRecipeOf(Entry.first);
7074     RecipeBuilder.recordRecipeOf(Entry.second);
7075   }
7076 
7077   // For each interleave group which is relevant for this (possibly trimmed)
7078   // Range, add it to the set of groups to be later applied to the VPlan and add
7079   // placeholders for its members' Recipes which we'll be replacing with a
7080   // single VPInterleaveRecipe.
7081   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7082     auto applyIG = [IG, this](unsigned VF) -> bool {
7083       return (VF >= 2 && // Query is illegal for VF == 1
7084               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7085                   LoopVectorizationCostModel::CM_Interleave);
7086     };
7087     if (!getDecisionAndClampRange(applyIG, Range))
7088       continue;
7089     InterleaveGroups.insert(IG);
7090     for (unsigned i = 0; i < IG->getFactor(); i++)
7091       if (Instruction *Member = IG->getMember(i))
7092         RecipeBuilder.recordRecipeOf(Member);
  }
7094 
7095   // ---------------------------------------------------------------------------
7096   // Build initial VPlan: Scan the body of the loop in a topological order to
7097   // visit each basic block after having visited its predecessor basic blocks.
7098   // ---------------------------------------------------------------------------
7099 
7100   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7101   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7102   auto Plan = std::make_unique<VPlan>(VPBB);
7103 
7104   // Represent values that will have defs inside VPlan.
7105   for (Value *V : NeedDef)
7106     Plan->addVPValue(V);
7107 
7108   // Scan the body of the loop in a topological order to visit each basic block
7109   // after having visited its predecessor basic blocks.
7110   LoopBlocksDFS DFS(OrigLoop);
7111   DFS.perform(LI);
7112 
7113   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and will fill a new VPBasicBlock.
7116     unsigned VPBBsForBB = 0;
7117     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7118     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7119     VPBB = FirstVPBBForBB;
7120     Builder.setInsertPoint(VPBB);
7121 
7122     // Introduce each ingredient into VPlan.
7123     for (Instruction &I : BB->instructionsWithoutDebug()) {
7124       Instruction *Instr = &I;
7125 
7126       // First filter out irrelevant instructions, to ensure no recipes are
7127       // built for them.
7128       if (isa<BranchInst>(Instr) ||
7129           DeadInstructions.find(Instr) != DeadInstructions.end())
7130         continue;
7131 
7132       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7133         continue;
7134 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7137       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7138           Instr, Range, VPBB, PredInst2Recipe, Plan);
7139       if (NextVPBB != VPBB) {
7140         VPBB = NextVPBB;
7141         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7142                                     : "");
7143       }
7144     }
7145   }
7146 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic-blocks with no recipes.
7150   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7151   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7152   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7153   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7154   delete PreEntry;
7155 
7156   // ---------------------------------------------------------------------------
7157   // Transform initial VPlan: Apply previously taken decisions, in order, to
7158   // bring the VPlan to its final state.
7159   // ---------------------------------------------------------------------------
7160 
7161   // Apply Sink-After legal constraints.
7162   for (auto &Entry : SinkAfter) {
7163     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7164     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7165     Sink->moveAfter(Target);
7166   }
7167 
7168   // Interleave memory: for each Interleave Group we marked earlier as relevant
7169   // for this VPlan, replace the Recipes widening its memory instructions with a
7170   // single VPInterleaveRecipe at its insertion point.
7171   for (auto IG : InterleaveGroups) {
7172     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7173         RecipeBuilder.getRecipe(IG->getInsertPos()));
7174     (new VPInterleaveRecipe(IG, Recipe->getMask()))->insertBefore(Recipe);
7175 
7176     for (unsigned i = 0; i < IG->getFactor(); ++i)
7177       if (Instruction *Member = IG->getMember(i)) {
7178         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7179       }
7180   }
7181 
7182   // Finally, if tail is folded by masking, introduce selects between the phi
7183   // and the live-out instruction of each reduction, at the end of the latch.
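  // For example (illustrative IR), a sum reduction gets, at the end of the
  // latch, a select of the form:
  //   %sel = select <VF x i1> %header.mask, <VF x i32> %red.next,
  //                 <VF x i32> %red.phi
  // so that masked-off (tail) lanes keep the phi's previous value.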
7184   if (CM.foldTailByMasking()) {
7185     Builder.setInsertPoint(VPBB);
7186     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7187     for (auto &Reduction : *Legal->getReductionVars()) {
7188       VPValue *Phi = Plan->getVPValue(Reduction.first);
7189       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7190       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7191     }
7192   }
7193 
7194   std::string PlanName;
7195   raw_string_ostream RSO(PlanName);
7196   unsigned VF = Range.Start;
7197   Plan->addVF(VF);
7198   RSO << "Initial VPlan for VF={" << VF;
7199   for (VF *= 2; VF < Range.End; VF *= 2) {
7200     Plan->addVF(VF);
7201     RSO << "," << VF;
7202   }
7203   RSO << "},UF>=1";
7204   RSO.flush();
7205   Plan->setName(PlanName);
7206 
7207   return Plan;
7208 }
7209 
7210 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
7215   assert(!OrigLoop->empty());
7216   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7217 
7218   // Create new empty VPlan
7219   auto Plan = std::make_unique<VPlan>();
7220 
7221   // Build hierarchical CFG
7222   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7223   HCFGBuilder.buildHierarchicalCFG();
7224 
7225   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7226     Plan->addVF(VF);
7227 
7228   if (EnableVPlanPredication) {
7229     VPlanPredicator VPP(*Plan);
7230     VPP.predicate();
7231 
7232     // Avoid running transformation to recipes until masked code generation in
7233     // VPlan-native path is in place.
7234     return Plan;
7235   }
7236 
7237   SmallPtrSet<Instruction *, 1> DeadInstructions;
7238   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7239       Plan, Legal->getInductionVars(), DeadInstructions);
7240 
7241   return Plan;
7242 }
7243 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7248 
7249 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7250   O << " +\n"
7251     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7252   IG->getInsertPos()->printAsOperand(O, false);
7253   if (User) {
7254     O << ", ";
7255     User->getOperand(0)->printAsOperand(O);
7256   }
7257   O << "\\l\"";
7258   for (unsigned i = 0; i < IG->getFactor(); ++i)
7259     if (Instruction *I = IG->getMember(i))
7260       O << " +\n"
7261         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7262 }
7263 
7264 void VPWidenRecipe::execute(VPTransformState &State) {
7265   for (auto &Instr : make_range(Begin, End))
7266     State.ILV->widenInstruction(Instr);
7267 }
7268 
7269 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7270   assert(!State.Instance && "Int or FP induction being replicated.");
7271   State.ILV->widenIntOrFpInduction(IV, Trunc);
7272 }
7273 
7274 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7275   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7276 }
7277 
7278 void VPBlendRecipe::execute(VPTransformState &State) {
7279   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7280   // We know that all PHIs in non-header blocks are converted into
7281   // selects, so we don't have to worry about the insertion order and we
7282   // can just use the builder.
7283   // At this point we generate the predication tree. There may be
7284   // duplications since this is a simple recursive scan, but future
7285   // optimizations will clean it up.
7286 
7287   unsigned NumIncoming = Phi->getNumIncomingValues();
7288 
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7291   // Generate a sequence of selects of the form:
7292   // SELECT(Mask3, In3,
7293   //      SELECT(Mask2, In2,
7294   //                   ( ...)))
7295   InnerLoopVectorizer::VectorParts Entry(State.UF);
7296   for (unsigned In = 0; In < NumIncoming; ++In) {
7297     for (unsigned Part = 0; Part < State.UF; ++Part) {
7298       // We might have single edge PHIs (blocks) - use an identity
7299       // 'select' for the first PHI operand.
7300       Value *In0 =
7301           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7302       if (In == 0)
7303         Entry[Part] = In0; // Initialize with the first incoming value.
7304       else {
7305         // Select between the current value and the previous incoming edge
7306         // based on the incoming mask.
7307         Value *Cond = State.get(User->getOperand(In), Part);
7308         Entry[Part] =
7309             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7310       }
7311     }
7312   }
7313   for (unsigned Part = 0; Part < State.UF; ++Part)
7314     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7315 }
7316 
7317 void VPInterleaveRecipe::execute(VPTransformState &State) {
7318   assert(!State.Instance && "Interleave group being replicated.");
7319   if (!User)
7320     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7321 
7322   // Last (and currently only) operand is a mask.
7323   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7324   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7325   for (unsigned Part = 0; Part < State.UF; ++Part)
7326     MaskValues[Part] = State.get(Mask, Part);
7327   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7328 }
7329 
7330 void VPReplicateRecipe::execute(VPTransformState &State) {
7331   if (State.Instance) { // Generate a single instance.
7332     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7333     // Insert scalar instance packing it into a vector.
7334     if (AlsoPack && State.VF > 1) {
7335       // If we're constructing lane 0, initialize to start from undef.
7336       if (State.Instance->Lane == 0) {
7337         Value *Undef =
7338             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7339         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7340       }
7341       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7342     }
7343     return;
7344   }
7345 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
7349   unsigned EndLane = IsUniform ? 1 : State.VF;
7350   for (unsigned Part = 0; Part < State.UF; ++Part)
7351     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7352       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7353 }

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
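
// Sketch of the emitted IR (illustrative names; assumes a vector block-in
// mask): the lane's bit is extracted and the placeholder terminator replaced:
//
//   %bit = extractelement <4 x i1> %block.in.mask, i32 <Lane>
//   br i1 %bit, ...   ; both successors are still null here and get patched
//                     ; in once the corresponding basic blocks are created.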

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}
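
// For illustration (hypothetical value names): the vector case above produces
// a phi merging the unmodified vector with the vector that has the new lane
// inserted, while the scalar case merges undef with the predicated result:
//
//   %v.phi = phi <4 x i32> [ %vec.before, %predicating.bb ],
//                          [ %vec.with.lane, %predicated.bb ]
//   %s.phi = phi i32 [ undef, %predicating.bb ],
//                    [ %scalar.res, %predicated.bb ]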

void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *Mask = getMask();
  if (!Mask)
    return State.ILV->vectorizeMemoryInstruction(&Instr);

  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}

static ScalarEpilogueLowering
getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
                          TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                          AssumptionCache *AC, LoopInfo *LI,
                          ScalarEvolution *SE, DominatorTree *DT,
                          const LoopAccessInfo *LAI) {
  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
  bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
                              !PreferPredicateOverEpilog;

  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
      (F->hasOptSize() ||
       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
    SEL = CM_ScalarEpilogueNotAllowedOptSize;
  else if (PreferPredicateOverEpilog ||
           Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
           (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) &&
            Hints.getPredicate() != LoopVectorizeHints::FK_Disabled &&
            !PredicateOptDisabled))
    SEL = CM_ScalarEpilogueNotNeededUsePredicate;

  return SEL;
}
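
// A rough summary of the choice above (descriptive, not normative): if the
// function or loop header is being optimized for size and vectorization was
// not forced, a scalar epilogue is disallowed; otherwise the
// -prefer-predicate-over-epilog option, a predication hint, or an
// unoverridden target preference selects tail folding by masking; failing
// both, a scalar epilogue remains allowed.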

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL =
    getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
                              PSE.getSE(), DT, LVL->getLAI());

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL =
    getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
                              PSE.getSE(), DT, LVL.getLAI());

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->empty())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->empty() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }
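
  // For example (illustrative): if SCEV proves the loop executes only a few
  // iterations, below the tiny-trip-count threshold, and vectorization was not
  // explicitly forced, the switch of SEL above disallows a scalar epilogue so
  // that vectorizing cannot add per-loop overhead that a short-running scalar
  // loop would not have had.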

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem to be correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);

  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;
  unsigned UserIC = Hints.getInterleave();

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }
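
  // Summary of the outcomes handled below (descriptive only): if both
  // VectorizeLoop and InterleaveLoop are false we emit missed-optimization
  // remarks and bail out; if exactly one is set we report the other decision
  // as an analysis remark; if both are set we proceed with VF.Width and IC.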

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
}