1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
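// For example (conceptually, with a vectorization factor of 4), a loop such as
//   for (i = 0; i < n; ++i) a[i] = b[i] + c[i];
// is rewritten so that each wide iteration computes a[i..i+3] from b[i..i+3]
// and c[i..i+3], and 'i' is incremented by 4.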
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/SizeOpts.h"
141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142 #include <algorithm>
143 #include <cassert>
144 #include <cstdint>
145 #include <cstdlib>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
// Indicates that an epilogue is undesired and that predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
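// For example (conceptually), with a vectorization factor of 4 and a trip
// count of 10, tail folding executes three masked vector iterations instead
// of two vector iterations followed by a two-iteration scalar epilogue.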
183 static cl::opt<bool> PreferPredicateOverEpilog(
184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185     cl::desc("Indicate that an epilogue is undesired, predication should be "
186              "used instead."));
187 
188 static cl::opt<bool> MaximizeBandwidth(
189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190     cl::desc("Maximize bandwidth when selecting vectorization factor which "
191              "will be determined by the smallest type in loop."));
192 
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
202 
203 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
204     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
205     cl::desc("We don't interleave loops with a estimated constant trip count "
206              "below this number"));
207 
208 static cl::opt<unsigned> ForceTargetNumScalarRegs(
209     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
210     cl::desc("A flag that overrides the target's number of scalar registers."));
211 
212 static cl::opt<unsigned> ForceTargetNumVectorRegs(
213     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
214     cl::desc("A flag that overrides the target's number of vector registers."));
215 
216 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
217     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
218     cl::desc("A flag that overrides the target's max interleave factor for "
219              "scalar loops."));
220 
221 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
222     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
223     cl::desc("A flag that overrides the target's max interleave factor for "
224              "vectorized loops."));
225 
226 static cl::opt<unsigned> ForceTargetInstructionCost(
227     "force-target-instruction-cost", cl::init(0), cl::Hidden,
228     cl::desc("A flag that overrides the target's expected cost for "
229              "an instruction to a single constant value. Mostly "
230              "useful for getting consistent testing."));
231 
232 static cl::opt<unsigned> SmallLoopCost(
233     "small-loop-cost", cl::init(20), cl::Hidden,
234     cl::desc(
235         "The cost of a loop that is considered 'small' by the interleaver."));
236 
237 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
238     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
239     cl::desc("Enable the use of the block frequency analysis to access PGO "
240              "heuristics minimizing code growth in cold regions and being more "
241              "aggressive in hot regions."));
242 
243 // Runtime interleave loops for load/store throughput.
244 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
245     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
246     cl::desc(
247         "Enable runtime interleaving until load/store ports are saturated"));
248 
249 /// The number of stores in a loop that are allowed to need predication.
250 static cl::opt<unsigned> NumberOfStoresToPredicate(
251     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
252     cl::desc("Max number of stores to be predicated behind an if."));
253 
254 static cl::opt<bool> EnableIndVarRegisterHeur(
255     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
256     cl::desc("Count the induction variable only once when interleaving"));
257 
258 static cl::opt<bool> EnableCondStoresVectorization(
259     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
260     cl::desc("Enable if predication of stores during vectorization."));
261 
262 static cl::opt<unsigned> MaxNestedScalarReductionIC(
263     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
264     cl::desc("The maximum interleave count to use when interleaving a scalar "
265              "reduction in a nested loop."));
266 
267 cl::opt<bool> EnableVPlanNativePath(
268     "enable-vplan-native-path", cl::init(false), cl::Hidden,
269     cl::desc("Enable VPlan-native vectorization path with "
270              "support for outer loop vectorization."));
271 
272 // FIXME: Remove this switch once we have divergence analysis. Currently we
273 // assume divergent non-backedge branches when this switch is true.
274 cl::opt<bool> EnableVPlanPredication(
275     "enable-vplan-predication", cl::init(false), cl::Hidden,
276     cl::desc("Enable VPlan-native vectorization path predicator with "
277              "support for outer loop vectorization."));
278 
279 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
281 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
282 // verification of the H-CFGs built.
283 static cl::opt<bool> VPlanBuildStressTest(
284     "vplan-build-stress-test", cl::init(false), cl::Hidden,
285     cl::desc(
286         "Build VPlan for every supported loop nest in the function and bail "
287         "out right after the build (stress test the VPlan H-CFG construction "
288         "in the VPlan-native vectorization path)."));
289 
290 cl::opt<bool> llvm::EnableLoopInterleaving(
291     "interleave-loops", cl::init(true), cl::Hidden,
292     cl::desc("Enable loop interleaving in Loop vectorization passes"));
293 cl::opt<bool> llvm::EnableLoopVectorization(
294     "vectorize-loops", cl::init(true), cl::Hidden,
295     cl::desc("Run the Loop vectorization passes"));
296 
297 /// A helper function for converting Scalar types to vector types.
298 /// If the incoming type is void, we return void. If the VF is 1, we return
299 /// the scalar type.
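/// For example, ToVectorTy(i32, 4) returns <4 x i32>, while
/// ToVectorTy(i32, 1) returns the scalar type i32.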
300 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
301   if (Scalar->isVoidTy() || VF == 1)
302     return Scalar;
303   return VectorType::get(Scalar, VF);
304 }
305 
306 /// A helper function that returns the type of loaded or stored value.
307 static Type *getMemInstValueType(Value *I) {
308   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
309          "Expected Load or Store instruction");
310   if (auto *LI = dyn_cast<LoadInst>(I))
311     return LI->getType();
312   return cast<StoreInst>(I)->getValueOperand()->getType();
313 }
314 
315 /// A helper function that returns true if the given type is irregular. The
316 /// type is irregular if its allocated size doesn't equal the store size of an
317 /// element of the corresponding vector type at the given vectorization factor.
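/// For example, i1 is irregular at, say, VF = 4: four scalar i1 values occupy
/// four bytes when stored individually, while a <4 x i1> vector stores into a
/// single byte, so the scalar accesses cannot simply be widened by bitcasting.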
318 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
319   // Determine if an array of VF elements of type Ty is "bitcast compatible"
320   // with a <VF x Ty> vector.
321   if (VF > 1) {
322     auto *VectorTy = VectorType::get(Ty, VF);
323     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
324   }
325 
326   // If the vectorization factor is one, we just check if an array of type Ty
327   // requires padding between elements.
328   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
329 }
330 
331 /// A helper function that returns the reciprocal of the block probability of
332 /// predicated blocks. If we return X, we are assuming the predicated block
333 /// will execute once for every X iterations of the loop header.
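/// For example, the current return value of 2 models a 50% execution
/// probability, so the cost attributed to a predicated block is typically
/// discounted by a factor of 2.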
334 ///
335 /// TODO: We should use actual block probability here, if available. Currently,
336 ///       we always assume predicated blocks have a 50% chance of executing.
337 static unsigned getReciprocalPredBlockProb() { return 2; }
338 
339 /// A helper function that adds a 'fast' flag to floating-point operations.
340 static Value *addFastMathFlag(Value *V) {
341   if (isa<FPMathOperator>(V))
342     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
343   return V;
344 }
345 
346 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
347   if (isa<FPMathOperator>(V))
348     cast<Instruction>(V)->setFastMathFlags(FMF);
349   return V;
350 }
351 
352 /// A helper function that returns an integer or floating-point constant with
353 /// value C.
354 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
355   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
356                            : ConstantFP::get(Ty, C);
357 }
358 
359 /// Returns "best known" trip count for the specified loop \p L as defined by
360 /// the following procedure:
361 ///   1) Returns exact trip count if it is known.
362 ///   2) Returns expected trip count according to profile data if any.
363 ///   3) Returns upper bound estimate if it is known.
364 ///   4) Returns None if all of the above failed.
365 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
366   // Check if exact trip count is known.
367   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
368     return ExpectedTC;
369 
370   // Check if there is an expected trip count available from profile data.
371   if (LoopVectorizeWithBlockFrequency)
372     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
373       return EstimatedTC;
374 
375   // Check if upper bound estimate is known.
376   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
377     return ExpectedTC;
378 
379   return None;
380 }
381 
382 namespace llvm {
383 
384 /// InnerLoopVectorizer vectorizes loops which contain only one basic
385 /// block to a specified vectorization factor (VF).
386 /// This class performs the widening of scalars into vectors, or multiple
387 /// scalars. This class also implements the following features:
388 /// * It inserts an epilogue loop for handling loops that don't have iteration
389 ///   counts that are known to be a multiple of the vectorization factor.
390 /// * It handles the code generation for reduction variables.
391 /// * Scalarization (implementation using scalars) of un-vectorizable
392 ///   instructions.
393 /// InnerLoopVectorizer does not perform any vectorization-legality
394 /// checks, and relies on the caller to check for the different legality
395 /// aspects. The InnerLoopVectorizer relies on the
396 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found in the loop.
398 class InnerLoopVectorizer {
399 public:
400   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
401                       LoopInfo *LI, DominatorTree *DT,
402                       const TargetLibraryInfo *TLI,
403                       const TargetTransformInfo *TTI, AssumptionCache *AC,
404                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
405                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
406                       LoopVectorizationCostModel *CM)
407       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
408         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
409         Builder(PSE.getSE()->getContext()),
410         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
411   virtual ~InnerLoopVectorizer() = default;
412 
413   /// Create a new empty loop. Unlink the old loop and connect the new one.
414   /// Return the pre-header block of the new loop.
415   BasicBlock *createVectorizedLoopSkeleton();
416 
417   /// Widen a single instruction within the innermost loop.
418   void widenInstruction(Instruction &I);
419 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
421   void fixVectorizedLoop();
422 
423   // Return true if any runtime check is added.
424   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
425 
426   /// A type for vectorized values in the new loop. Each value from the
427   /// original loop, when vectorized, is represented by UF vector values in the
428   /// new unrolled loop, where UF is the unroll factor.
429   using VectorParts = SmallVector<Value *, 2>;
430 
431   /// Vectorize a single GetElementPtrInst based on information gathered and
432   /// decisions taken during planning.
433   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
434                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
435 
436   /// Vectorize a single PHINode in a block. This method handles the induction
437   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
438   /// arbitrary length vectors.
439   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
440 
  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a scalar copy of the instruction for the unroll part and vector
  /// lane given by \p Instance.
445   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
446                             bool IfPredicateInstr);
447 
448   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
449   /// is provided, the integer induction variable will first be truncated to
450   /// the corresponding type.
451   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
452 
453   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
454   /// vector or scalar value on-demand if one is not yet available. When
455   /// vectorizing a loop, we visit the definition of an instruction before its
456   /// uses. When visiting the definition, we either vectorize or scalarize the
457   /// instruction, creating an entry for it in the corresponding map. (In some
458   /// cases, such as induction variables, we will create both vector and scalar
459   /// entries.) Then, as we encounter uses of the definition, we derive values
460   /// for each scalar or vector use unless such a value is already available.
461   /// For example, if we scalarize a definition and one of its uses is vector,
462   /// we build the required vector on-demand with an insertelement sequence
463   /// when visiting the use. Otherwise, if the use is scalar, we can use the
464   /// existing scalar definition.
465   ///
466   /// Return a value in the new loop corresponding to \p V from the original
467   /// loop at unroll index \p Part. If the value has already been vectorized,
468   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
469   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
470   /// a new vector value on-demand by inserting the scalar values into a vector
471   /// with an insertelement sequence. If the value has been neither vectorized
472   /// nor scalarized, it must be loop invariant, so we simply broadcast the
473   /// value into a vector.
474   Value *getOrCreateVectorValue(Value *V, unsigned Part);
475 
476   /// Return a value in the new loop corresponding to \p V from the original
477   /// loop at unroll and vector indices \p Instance. If the value has been
478   /// vectorized but not scalarized, the necessary extractelement instruction
479   /// will be generated.
480   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
481 
482   /// Construct the vector value of a scalarized value \p V one lane at a time.
483   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
484 
485   /// Try to vectorize the interleaved access group that \p Instr belongs to,
486   /// optionally masking the vector operations if \p BlockInMask is non-null.
487   void vectorizeInterleaveGroup(Instruction *Instr,
488                                 VectorParts *BlockInMask = nullptr);
489 
490   /// Vectorize Load and Store instructions, optionally masking the vector
491   /// operations if \p BlockInMask is non-null.
492   void vectorizeMemoryInstruction(Instruction *Instr,
493                                   VectorParts *BlockInMask = nullptr);
494 
495   /// Set the debug location in the builder using the debug location in
496   /// the instruction.
497   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
498 
499   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
501 
502 protected:
503   friend class LoopVectorizationPlanner;
504 
505   /// A small list of PHINodes.
506   using PhiVector = SmallVector<PHINode *, 4>;
507 
508   /// A type for scalarized values in the new loop. Each value from the
509   /// original loop, when scalarized, is represented by UF x VF scalar values
510   /// in the new unrolled loop, where UF is the unroll factor and VF is the
511   /// vectorization factor.
512   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
513 
514   /// Set up the values of the IVs correctly when exiting the vector loop.
515   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
516                     Value *CountRoundDown, Value *EndValue,
517                     BasicBlock *MiddleBlock);
518 
519   /// Create a new induction variable inside L.
520   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
521                                    Value *Step, Instruction *DL);
522 
523   /// Handle all cross-iteration phis in the header.
524   void fixCrossIterationPHIs();
525 
526   /// Fix a first-order recurrence. This is the second phase of vectorizing
527   /// this phi node.
528   void fixFirstOrderRecurrence(PHINode *Phi);
529 
530   /// Fix a reduction cross-iteration phi. This is the second phase of
531   /// vectorizing this phi node.
532   void fixReduction(PHINode *Phi);
533 
534   /// Clear NSW/NUW flags from reduction instructions if necessary.
535   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
536 
  /// The loop exit block may have single-value PHI nodes with some
  /// incoming value. While vectorizing we only handle values that are
  /// defined inside the loop, and we should have one value for each
  /// predecessor of its parent basic block. See PR14725.
541   void fixLCSSAPHIs();
542 
543   /// Iteratively sink the scalarized operands of a predicated instruction into
544   /// the block that was created for it.
545   void sinkScalarOperands(Instruction *PredInst);
546 
547   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
548   /// represented as.
549   void truncateToMinimalBitwidths();
550 
551   /// Create a broadcast instruction. This method generates a broadcast
552   /// instruction (shuffle) for loop invariant values and for the induction
553   /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
555   /// element.
556   virtual Value *getBroadcastInstrs(Value *V);
557 
558   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
561   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
562                                Instruction::BinaryOps Opcode =
563                                Instruction::BinaryOpsEnd);
564 
565   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
566   /// variable on which to base the steps, \p Step is the size of the step, and
567   /// \p EntryVal is the value from the original loop that maps to the steps.
568   /// Note that \p EntryVal doesn't have to be an induction variable - it
569   /// can also be a truncate instruction.
570   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
571                         const InductionDescriptor &ID);
572 
573   /// Create a vector induction phi node based on an existing scalar one. \p
574   /// EntryVal is the value from the original loop that maps to the vector phi
575   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
576   /// truncate instruction, instead of widening the original IV, we widen a
577   /// version of the IV truncated to \p EntryVal's type.
578   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
579                                        Value *Step, Instruction *EntryVal);
580 
581   /// Returns true if an instruction \p I should be scalarized instead of
582   /// vectorized for the chosen vectorization factor.
583   bool shouldScalarizeInstruction(Instruction *I) const;
584 
585   /// Returns true if we should generate a scalar version of \p IV.
586   bool needsScalarInduction(Instruction *IV) const;
587 
588   /// If there is a cast involved in the induction variable \p ID, which should
589   /// be ignored in the vectorized loop body, this function records the
590   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
591   /// cast. We had already proved that the casted Phi is equal to the uncasted
592   /// Phi in the vectorized loop (under a runtime guard), and therefore
593   /// there is no need to vectorize the cast - the same value can be used in the
594   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
597   ///
598   /// \p EntryVal is the value from the original loop that maps to the vector
599   /// phi node and is used to distinguish what is the IV currently being
600   /// processed - original one (if \p EntryVal is a phi corresponding to the
601   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
603   /// latter case \p EntryVal is a TruncInst and we must not record anything for
604   /// that IV, but it's error-prone to expect callers of this routine to care
605   /// about that, hence this explicit parameter.
606   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
607                                              const Instruction *EntryVal,
608                                              Value *VectorLoopValue,
609                                              unsigned Part,
610                                              unsigned Lane = UINT_MAX);
611 
612   /// Generate a shuffle sequence that will reverse the vector Vec.
613   virtual Value *reverseVector(Value *Vec);
614 
615   /// Returns (and creates if needed) the original loop trip count.
616   Value *getOrCreateTripCount(Loop *NewLoop);
617 
618   /// Returns (and creates if needed) the trip count of the widened loop.
619   Value *getOrCreateVectorTripCount(Loop *NewLoop);
620 
621   /// Returns a bitcasted value to the requested vector type.
622   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
623   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
624                                 const DataLayout &DL);
625 
626   /// Emit a bypass check to see if the vector trip count is zero, including if
627   /// it overflows.
628   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
629 
630   /// Emit a bypass check to see if all of the SCEV assumptions we've
631   /// had to make are correct.
632   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
633 
634   /// Emit bypass checks to check any memory assumptions we may have made.
635   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
636 
637   /// Compute the transformed value of Index at offset StartValue using step
638   /// StepValue.
639   /// For integer induction, returns StartValue + Index * StepValue.
640   /// For pointer induction, returns StartValue[Index * StepValue].
641   /// FIXME: The newly created binary instructions should contain nsw/nuw
642   /// flags, which can be found from the original scalar operations.
643   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
644                               const DataLayout &DL,
645                               const InductionDescriptor &ID) const;
646 
647   /// Add additional metadata to \p To that was not present on \p Orig.
648   ///
649   /// Currently this is used to add the noalias annotations based on the
650   /// inserted memchecks.  Use this for instructions that are *cloned* into the
651   /// vector loop.
652   void addNewMetadata(Instruction *To, const Instruction *Orig);
653 
654   /// Add metadata from one instruction to another.
655   ///
656   /// This includes both the original MDs from \p From and additional ones (\see
657   /// addNewMetadata).  Use this for *newly created* instructions in the vector
658   /// loop.
659   void addMetadata(Instruction *To, Instruction *From);
660 
661   /// Similar to the previous function but it adds the metadata to a
662   /// vector of instructions.
663   void addMetadata(ArrayRef<Value *> To, Instruction *From);
664 
665   /// The original loop.
666   Loop *OrigLoop;
667 
668   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
669   /// dynamic knowledge to simplify SCEV expressions and converts them to a
670   /// more usable form.
671   PredicatedScalarEvolution &PSE;
672 
673   /// Loop Info.
674   LoopInfo *LI;
675 
676   /// Dominator Tree.
677   DominatorTree *DT;
678 
679   /// Alias Analysis.
680   AliasAnalysis *AA;
681 
682   /// Target Library Info.
683   const TargetLibraryInfo *TLI;
684 
685   /// Target Transform Info.
686   const TargetTransformInfo *TTI;
687 
688   /// Assumption Cache.
689   AssumptionCache *AC;
690 
691   /// Interface to emit optimization remarks.
692   OptimizationRemarkEmitter *ORE;
693 
694   /// LoopVersioning.  It's only set up (non-null) if memchecks were
695   /// used.
696   ///
697   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
699   std::unique_ptr<LoopVersioning> LVer;
700 
701   /// The vectorization SIMD factor to use. Each vector will have this many
702   /// vector elements.
703   unsigned VF;
704 
705   /// The vectorization unroll factor to use. Each scalar is vectorized to this
706   /// many different vector instructions.
707   unsigned UF;
708 
  /// The builder that we use.
710   IRBuilder<> Builder;
711 
712   // --- Vectorization state ---
713 
714   /// The vector-loop preheader.
715   BasicBlock *LoopVectorPreHeader;
716 
717   /// The scalar-loop preheader.
718   BasicBlock *LoopScalarPreHeader;
719 
  /// Middle block between the vector and the scalar loop.
721   BasicBlock *LoopMiddleBlock;
722 
723   /// The ExitBlock of the scalar loop.
724   BasicBlock *LoopExitBlock;
725 
726   /// The vector loop body.
727   BasicBlock *LoopVectorBody;
728 
729   /// The scalar loop body.
730   BasicBlock *LoopScalarBody;
731 
732   /// A list of all bypass blocks. The first block is the entry of the loop.
733   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
734 
735   /// The new Induction variable which was added to the new block.
736   PHINode *Induction = nullptr;
737 
738   /// The induction variable of the old basic block.
739   PHINode *OldInduction = nullptr;
740 
741   /// Maps values from the original loop to their corresponding values in the
742   /// vectorized loop. A key value can map to either vector values, scalar
743   /// values or both kinds of values, depending on whether the key was
744   /// vectorized and scalarized.
745   VectorizerValueMap VectorLoopValueMap;
746 
747   /// Store instructions that were predicated.
748   SmallVector<Instruction *, 4> PredicatedInstructions;
749 
750   /// Trip count of the original loop.
751   Value *TripCount = nullptr;
752 
753   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
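  /// For example, with TripCount = 10, VF = 4 and UF = 1, this is
  /// 10 - 10 % 4 = 8.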
754   Value *VectorTripCount = nullptr;
755 
756   /// The legality analysis.
757   LoopVectorizationLegality *Legal;
758 
  /// The profitability analysis.
760   LoopVectorizationCostModel *Cost;
761 
762   // Record whether runtime checks are added.
763   bool AddedSafetyChecks = false;
764 
765   // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
767   DenseMap<PHINode *, Value *> IVEndValues;
768 
769   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
770   // fixed up at the end of vector code generation.
771   SmallVector<PHINode *, 8> OrigPHIsToFix;
772 };
773 
774 class InnerLoopUnroller : public InnerLoopVectorizer {
775 public:
776   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
777                     LoopInfo *LI, DominatorTree *DT,
778                     const TargetLibraryInfo *TLI,
779                     const TargetTransformInfo *TTI, AssumptionCache *AC,
780                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
781                     LoopVectorizationLegality *LVL,
782                     LoopVectorizationCostModel *CM)
783       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
784                             UnrollFactor, LVL, CM) {}
785 
786 private:
787   Value *getBroadcastInstrs(Value *V) override;
788   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
789                        Instruction::BinaryOps Opcode =
790                        Instruction::BinaryOpsEnd) override;
791   Value *reverseVector(Value *Vec) override;
792 };
793 
794 } // end namespace llvm
795 
/// Look for a meaningful debug location on the instruction or its
797 /// operands.
798 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
799   if (!I)
800     return I;
801 
802   DebugLoc Empty;
803   if (I->getDebugLoc() != Empty)
804     return I;
805 
  for (Value *Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }
811 
812   return I;
813 }
814 
815 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
816   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
817     const DILocation *DIL = Inst->getDebugLoc();
818     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
819         !isa<DbgInfoIntrinsic>(Inst)) {
820       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
821       if (NewDIL)
822         B.SetCurrentDebugLocation(NewDIL.getValue());
823       else
824         LLVM_DEBUG(dbgs()
825                    << "Failed to create new discriminator: "
826                    << DIL->getFilename() << " Line: " << DIL->getLine());
827     }
828     else
829       B.SetCurrentDebugLocation(DIL);
830   } else
831     B.SetCurrentDebugLocation(DebugLoc());
832 }
833 
834 /// Write a record \p DebugMsg about vectorization failure to the debug
835 /// output stream. If \p I is passed, it is an instruction that prevents
836 /// vectorization.
837 #ifndef NDEBUG
838 static void debugVectorizationFailure(const StringRef DebugMsg,
839     Instruction *I) {
840   dbgs() << "LV: Not vectorizing: " << DebugMsg;
841   if (I != nullptr)
842     dbgs() << " " << *I;
843   else
844     dbgs() << '.';
845   dbgs() << '\n';
846 }
847 #endif
848 
849 /// Create an analysis remark that explains why vectorization failed
850 ///
851 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
852 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
853 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
854 /// the location of the remark.  \return the remark object that can be
855 /// streamed to.
856 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
857     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
858   Value *CodeRegion = TheLoop->getHeader();
859   DebugLoc DL = TheLoop->getStartLoc();
860 
861   if (I) {
862     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
864     // using the loop's.
865     if (I->getDebugLoc())
866       DL = I->getDebugLoc();
867   }
868 
869   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
870   R << "loop not vectorized: ";
871   return R;
872 }
873 
874 namespace llvm {
875 
876 void reportVectorizationFailure(const StringRef DebugMsg,
877     const StringRef OREMsg, const StringRef ORETag,
878     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
879   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
880   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
881   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
882                 ORETag, TheLoop, I) << OREMsg);
883 }
884 
885 } // end namespace llvm
886 
887 #ifndef NDEBUG
888 /// \return string containing a file name and a line # for the given loop.
889 static std::string getDebugLocString(const Loop *L) {
890   std::string Result;
891   if (L) {
892     raw_string_ostream OS(Result);
893     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
894       LoopDbgLoc.print(OS);
895     else
896       // Just print the module name.
897       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
898     OS.flush();
899   }
900   return Result;
901 }
902 #endif
903 
904 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
905                                          const Instruction *Orig) {
906   // If the loop was versioned with memchecks, add the corresponding no-alias
907   // metadata.
908   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
909     LVer->annotateInstWithNoAlias(To, Orig);
910 }
911 
912 void InnerLoopVectorizer::addMetadata(Instruction *To,
913                                       Instruction *From) {
914   propagateMetadata(To, From);
915   addNewMetadata(To, From);
916 }
917 
918 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
919                                       Instruction *From) {
920   for (Value *V : To) {
921     if (Instruction *I = dyn_cast<Instruction>(V))
922       addMetadata(I, From);
923   }
924 }
925 
926 namespace llvm {
927 
// Hints to the loop-vectorization cost model about how the scalar epilogue
// loop should be lowered.
930 enum ScalarEpilogueLowering {
931 
932   // The default: allowing scalar epilogues.
933   CM_ScalarEpilogueAllowed,
934 
935   // Vectorization with OptForSize: don't allow epilogues.
936   CM_ScalarEpilogueNotAllowedOptSize,
937 
  // A special case of vectorization with OptForSize: loops with a very small
939   // trip count are considered for vectorization under OptForSize, thereby
940   // making sure the cost of their loop body is dominant, free of runtime
941   // guards and scalar iteration overheads.
942   CM_ScalarEpilogueNotAllowedLowTripLoop,
943 
944   // Loop hint predicate indicating an epilogue is undesired.
945   CM_ScalarEpilogueNotNeededUsePredicate
946 };
947 
948 /// LoopVectorizationCostModel - estimates the expected speedups due to
949 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
952 /// expected speedup/slowdowns due to the supported instruction set. We use the
953 /// TargetTransformInfo to query the different backends for the cost of
954 /// different operations.
955 class LoopVectorizationCostModel {
956 public:
957   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
958                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
959                              LoopVectorizationLegality *Legal,
960                              const TargetTransformInfo &TTI,
961                              const TargetLibraryInfo *TLI, DemandedBits *DB,
962                              AssumptionCache *AC,
963                              OptimizationRemarkEmitter *ORE, const Function *F,
964                              const LoopVectorizeHints *Hints,
965                              InterleavedAccessInfo &IAI)
966       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
967         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
968         Hints(Hints), InterleaveInfo(IAI) {}
969 
970   /// \return An upper bound for the vectorization factor, or None if
971   /// vectorization and interleaving should be avoided up front.
972   Optional<unsigned> computeMaxVF();
973 
974   /// \return True if runtime checks are required for vectorization, and false
975   /// otherwise.
976   bool runtimeChecksRequired();
977 
978   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
980   /// then this vectorization factor will be selected if vectorization is
981   /// possible.
982   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
983 
984   /// Setup cost-based decisions for user vectorization factor.
985   void selectUserVectorizationFactor(unsigned UserVF) {
986     collectUniformsAndScalars(UserVF);
987     collectInstsToScalarize(UserVF);
988   }
989 
990   /// \return The size (in bits) of the smallest and widest types in the code
991   /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64-bit loop indices.
993   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
994 
995   /// \return The desired interleave count.
996   /// If interleave count has been specified by metadata it will be returned.
997   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
998   /// are the selected vectorization factor and the cost of the selected VF.
999   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1000 
  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on its cost.
  /// This function takes cost-based widening decisions for Load/Store
  /// instructions and collects them in a map. This decision map is used for
  /// building the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1008   void setCostBasedWideningDecision(unsigned VF);
1009 
1010   /// A struct that represents some properties of the register usage
1011   /// of a loop.
1012   struct RegisterUsage {
1013     /// Holds the number of loop invariant values that are used in the loop.
1014     /// The key is ClassID of target-provided register class.
1015     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1016     /// Holds the maximum number of concurrent live intervals in the loop.
1017     /// The key is ClassID of target-provided register class.
1018     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1019   };
1020 
1021   /// \return Returns information about the register usages of the loop for the
1022   /// given vectorization factors.
1023   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1024 
1025   /// Collect values we want to ignore in the cost model.
1026   void collectValuesToIgnore();
1027 
1028   /// \returns The smallest bitwidth each instruction can be represented with.
1029   /// The vector equivalents of these instructions should be truncated to this
1030   /// type.
1031   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1032     return MinBWs;
1033   }
1034 
1035   /// \returns True if it is more profitable to scalarize instruction \p I for
1036   /// vectorization factor \p VF.
1037   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1038     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1039 
1040     // Cost model is not run in the VPlan-native path - return conservative
1041     // result until this changes.
1042     if (EnableVPlanNativePath)
1043       return false;
1044 
1045     auto Scalars = InstsToScalarize.find(VF);
1046     assert(Scalars != InstsToScalarize.end() &&
1047            "VF not yet analyzed for scalarization profitability");
1048     return Scalars->second.find(I) != Scalars->second.end();
1049   }
1050 
1051   /// Returns true if \p I is known to be uniform after vectorization.
1052   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1053     if (VF == 1)
1054       return true;
1055 
1056     // Cost model is not run in the VPlan-native path - return conservative
1057     // result until this changes.
1058     if (EnableVPlanNativePath)
1059       return false;
1060 
1061     auto UniformsPerVF = Uniforms.find(VF);
1062     assert(UniformsPerVF != Uniforms.end() &&
1063            "VF not yet analyzed for uniformity");
1064     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1065   }
1066 
1067   /// Returns true if \p I is known to be scalar after vectorization.
1068   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1069     if (VF == 1)
1070       return true;
1071 
1072     // Cost model is not run in the VPlan-native path - return conservative
1073     // result until this changes.
1074     if (EnableVPlanNativePath)
1075       return false;
1076 
1077     auto ScalarsPerVF = Scalars.find(VF);
1078     assert(ScalarsPerVF != Scalars.end() &&
1079            "Scalar values are not calculated for VF");
1080     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1081   }
1082 
1083   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1084   /// for vectorization factor \p VF.
1085   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1086     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1087            !isProfitableToScalarize(I, VF) &&
1088            !isScalarAfterVectorization(I, VF);
1089   }
1090 
1091   /// Decision that was taken during cost calculation for memory instruction.
1092   enum InstWidening {
1093     CM_Unknown,
1094     CM_Widen,         // For consecutive accesses with stride +1.
1095     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1096     CM_Interleave,
1097     CM_GatherScatter,
1098     CM_Scalarize
1099   };
1100 
1101   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1102   /// instruction \p I and vector width \p VF.
1103   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1104                            unsigned Cost) {
1105     assert(VF >= 2 && "Expected VF >=2");
1106     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1107   }
1108 
1109   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1110   /// interleaving group \p Grp and vector width \p VF.
1111   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1112                            InstWidening W, unsigned Cost) {
1113     assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1115     /// But the cost will be assigned to one instruction only.
1116     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1117       if (auto *I = Grp->getMember(i)) {
1118         if (Grp->getInsertPos() == I)
1119           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1120         else
1121           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1122       }
1123     }
1124   }
1125 
1126   /// Return the cost model decision for the given instruction \p I and vector
1127   /// width \p VF. Return CM_Unknown if this instruction did not pass
1128   /// through the cost modeling.
1129   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1130     assert(VF >= 2 && "Expected VF >=2");
1131 
1132     // Cost model is not run in the VPlan-native path - return conservative
1133     // result until this changes.
1134     if (EnableVPlanNativePath)
1135       return CM_GatherScatter;
1136 
1137     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1138     auto Itr = WideningDecisions.find(InstOnVF);
1139     if (Itr == WideningDecisions.end())
1140       return CM_Unknown;
1141     return Itr->second.first;
1142   }
1143 
1144   /// Return the vectorization cost for the given instruction \p I and vector
1145   /// width \p VF.
1146   unsigned getWideningCost(Instruction *I, unsigned VF) {
1147     assert(VF >= 2 && "Expected VF >=2");
1148     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1149     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1150            "The cost is not calculated");
1151     return WideningDecisions[InstOnVF].second;
1152   }
1153 
1154   /// Return True if instruction \p I is an optimizable truncate whose operand
1155   /// is an induction variable. Such a truncate will be removed by adding a new
1156   /// induction variable with the destination type.
1157   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1158     // If the instruction is not a truncate, return false.
1159     auto *Trunc = dyn_cast<TruncInst>(I);
1160     if (!Trunc)
1161       return false;
1162 
1163     // Get the source and destination types of the truncate.
1164     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1165     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1166 
1167     // If the truncate is free for the given types, return false. Replacing a
1168     // free truncate with an induction variable would add an induction variable
1169     // update instruction to each iteration of the loop. We exclude from this
1170     // check the primary induction variable since it will need an update
1171     // instruction regardless.
1172     Value *Op = Trunc->getOperand(0);
1173     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1174       return false;
1175 
1176     // If the truncated value is not an induction variable, return false.
1177     return Legal->isInductionPhi(Op);
1178   }
1179 
1180   /// Collects the instructions to scalarize for each predicated instruction in
1181   /// the loop.
1182   void collectInstsToScalarize(unsigned VF);
1183 
1184   /// Collect Uniform and Scalar values for the given \p VF.
1185   /// The sets depend on CM decision for Load/Store instructions
1186   /// that may be vectorized as interleave, gather-scatter or scalarized.
1187   void collectUniformsAndScalars(unsigned VF) {
1188     // Do the analysis once.
1189     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1190       return;
1191     setCostBasedWideningDecision(VF);
1192     collectLoopUniforms(VF);
1193     collectLoopScalars(VF);
1194   }
1195 
1196   /// Returns true if the target machine supports masked store operation
1197   /// for the given \p DataType and kind of access to \p Ptr.
1198   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1199     return Legal->isConsecutivePtr(Ptr) &&
1200            TTI.isLegalMaskedStore(DataType, Alignment);
1201   }
1202 
1203   /// Returns true if the target machine supports masked load operation
1204   /// for the given \p DataType and kind of access to \p Ptr.
1205   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1206     return Legal->isConsecutivePtr(Ptr) &&
1207            TTI.isLegalMaskedLoad(DataType, Alignment);
1208   }
1209 
1210   /// Returns true if the target machine supports masked scatter operation
1211   /// for the given \p DataType.
1212   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1213     return TTI.isLegalMaskedScatter(DataType, Alignment);
1214   }
1215 
1216   /// Returns true if the target machine supports masked gather operation
1217   /// for the given \p DataType.
1218   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1219     return TTI.isLegalMaskedGather(DataType, Alignment);
1220   }
1221 
1222   /// Returns true if the target machine can represent \p V as a masked gather
1223   /// or scatter operation.
1224   bool isLegalGatherOrScatter(Value *V) {
1225     bool LI = isa<LoadInst>(V);
1226     bool SI = isa<StoreInst>(V);
1227     if (!LI && !SI)
1228       return false;
1229     auto *Ty = getMemInstValueType(V);
1230     MaybeAlign Align = getLoadStoreAlignment(V);
1231     return (LI && isLegalMaskedGather(Ty, Align)) ||
1232            (SI && isLegalMaskedScatter(Ty, Align));
1233   }
1234 
1235   /// Returns true if \p I is an instruction that will be scalarized with
1236   /// predication. Such instructions include conditional stores and
1237   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1240   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1241 
1242   // Returns true if \p I is an instruction that will be predicated either
1243   // through scalar predication or masked load/store or masked gather/scatter.
1244   // Superset of instructions that return true for isScalarWithPredication.
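  // For example (illustrative), a store that executes only under a
  // loop-varying condition, such as "if (cond[i]) A[i] = x;", is predicated,
  // as is a udiv whose divisor might be zero in masked-off lanes.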
1245   bool isPredicatedInst(Instruction *I) {
1246     if (!blockNeedsPredication(I->getParent()))
1247       return false;
1248     // Loads and stores that need some form of masked operation are predicated
1249     // instructions.
1250     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1251       return Legal->isMaskRequired(I);
1252     return isScalarWithPredication(I);
1253   }
1254 
1255   /// Returns true if \p I is a memory instruction with consecutive memory
1256   /// access that can be widened.
1257   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1258 
1259   /// Returns true if \p I is a memory instruction in an interleaved-group
1260   /// of memory accesses that can be vectorized with wide vector loads/stores
1261   /// and shuffles.
1262   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1263 
1264   /// Check if \p Instr belongs to any interleaved access group.
1265   bool isAccessInterleaved(Instruction *Instr) {
1266     return InterleaveInfo.isInterleaved(Instr);
1267   }
1268 
1269   /// Get the interleaved access group that \p Instr belongs to.
1270   const InterleaveGroup<Instruction> *
1271   getInterleavedAccessGroup(Instruction *Instr) {
1272     return InterleaveInfo.getInterleaveGroup(Instr);
1273   }
1274 
1275   /// Returns true if an interleaved group requires a scalar iteration
1276   /// to handle accesses with gaps, and there is nothing preventing us from
1277   /// creating a scalar epilogue.
1278   bool requiresScalarEpilogue() const {
1279     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1280   }
1281 
  /// Returns true if a scalar epilogue is allowed, i.e., it has not been
  /// disallowed due to optsize or a loop hint annotation.
1284   bool isScalarEpilogueAllowed() const {
1285     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1286   }
1287 
  /// Returns true if all loop blocks should be masked to fold the tail loop.
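  /// For example (illustrative), with a trip count of 10 and VF = 4, folding
  /// the tail by masking runs three masked vector iterations covering lanes
  /// 0-11, with lanes 10 and 11 disabled by the mask, instead of two vector
  /// iterations plus a scalar remainder loop.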
1289   bool foldTailByMasking() const { return FoldTailByMasking; }
1290 
1291   bool blockNeedsPredication(BasicBlock *BB) {
1292     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1293   }
1294 
1295   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1296   /// with factor VF.  Return the cost of the instruction, including
1297   /// scalarization overhead if it's needed.
1298   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1299 
  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or it is too
  /// expensive.
1305   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1306 
1307 private:
1308   unsigned NumPredStores = 0;
1309 
1310   /// \return An upper bound for the vectorization factor, larger than zero.
1311   /// One is returned if vectorization should best be avoided due to cost.
1312   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1313 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1321   using VectorizationCostTy = std::pair<unsigned, bool>;
1322 
1323   /// Returns the expected execution cost. The unit of the cost does
1324   /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
1327   VectorizationCostTy expectedCost(unsigned VF);
1328 
1329   /// Returns the execution time cost of an instruction for a given vector
1330   /// width. Vector width of one means scalar.
1331   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1332 
1333   /// The cost-computation logic from getInstructionCost which provides
1334   /// the vector type as an output parameter.
1335   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1336 
1337   /// Calculate vectorization cost of memory instruction \p I.
1338   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1339 
1340   /// The cost computation for scalarized memory instruction.
1341   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1342 
1343   /// The cost computation for interleaving group of memory instructions.
1344   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1345 
1346   /// The cost computation for Gather/Scatter instruction.
1347   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1348 
1349   /// The cost computation for widening instruction \p I with consecutive
1350   /// memory access.
1351   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1352 
  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
1357   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1358 
1359   /// Estimate the overhead of scalarizing an instruction. This is a
1360   /// convenience wrapper for the type-based getScalarizationOverhead API.
1361   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1362 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1365   bool isConsecutiveLoadOrStore(Instruction *I);
1366 
1367   /// Returns true if an artificially high cost for emulated masked memrefs
1368   /// should be used.
1369   bool useEmulatedMaskMemRefHack(Instruction *I);
1370 
1371   /// Map of scalar integer values to the smallest bitwidth they can be legally
1372   /// represented as. The vector equivalents of these values should be truncated
1373   /// to this type.
1374   MapVector<Instruction *, uint64_t> MinBWs;
1375 
1376   /// A type representing the costs for instructions if they were to be
1377   /// scalarized rather than vectorized. The entries are Instruction-Cost
1378   /// pairs.
1379   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1380 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1383   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1384 
1385   /// Records whether it is allowed to have the original scalar loop execute at
1386   /// least once. This may be needed as a fallback loop in case runtime
1387   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1389   /// or as a peel-loop to handle gaps in interleave-groups.
1390   /// Under optsize and when the trip count is very small we don't allow any
1391   /// iterations to execute in the scalar loop.
1392   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1393 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1395   bool FoldTailByMasking = false;
1396 
1397   /// A map holding scalar costs for different vectorization factors. The
1398   /// presence of a cost for an instruction in the mapping indicates that the
1399   /// instruction will be scalarized when vectorizing with the associated
1400   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1401   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1402 
1403   /// Holds the instructions known to be uniform after vectorization.
1404   /// The data is collected per VF.
1405   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1406 
1407   /// Holds the instructions known to be scalar after vectorization.
1408   /// The data is collected per VF.
1409   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1410 
1411   /// Holds the instructions (address computations) that are forced to be
1412   /// scalarized.
1413   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1414 
1415   /// Returns the expected difference in cost from scalarizing the expression
1416   /// feeding a predicated instruction \p PredInst. The instructions to
1417   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1418   /// non-negative return value implies the expression will be scalarized.
1419   /// Currently, only single-use chains are considered for scalarization.
1420   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1421                               unsigned VF);
1422 
1423   /// Collect the instructions that are uniform after vectorization. An
1424   /// instruction is uniform if we represent it with a single scalar value in
1425   /// the vectorized loop corresponding to each vector iteration. Examples of
1426   /// uniform instructions include pointer operands of consecutive or
1427   /// interleaved memory accesses. Note that although uniformity implies an
1428   /// instruction will be scalar, the reverse is not true. In general, a
1429   /// scalarized instruction will be represented by VF scalar values in the
1430   /// vectorized loop, each corresponding to an iteration of the original
1431   /// scalar loop.
1432   void collectLoopUniforms(unsigned VF);
1433 
1434   /// Collect the instructions that are scalar after vectorization. An
1435   /// instruction is scalar if it is known to be uniform or will be scalarized
1436   /// during vectorization. Non-uniform scalarized instructions will be
1437   /// represented by VF values in the vectorized loop, each corresponding to an
1438   /// iteration of the original scalar loop.
1439   void collectLoopScalars(unsigned VF);
1440 
  /// Keeps the cost model's vectorization decisions and costs for
  /// instructions. Currently it is used for memory instructions only.
1443   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1444                                 std::pair<InstWidening, unsigned>>;
1445 
1446   DecisionList WideningDecisions;
1447 
1448   /// Returns true if \p V is expected to be vectorized and it needs to be
1449   /// extracted.
1450   bool needsExtract(Value *V, unsigned VF) const {
1451     Instruction *I = dyn_cast<Instruction>(V);
1452     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1453       return false;
1454 
1455     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1457     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1458     // the scalars are collected. That should be a safe assumption in most
1459     // cases, because we check if the operands have vectorizable types
1460     // beforehand in LoopVectorizationLegality.
1461     return Scalars.find(VF) == Scalars.end() ||
1462            !isScalarAfterVectorization(I, VF);
1463   };
1464 
1465   /// Returns a range containing only operands needing to be extracted.
1466   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1467                                                    unsigned VF) {
1468     return SmallVector<Value *, 4>(make_filter_range(
1469         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1470   }
1471 
1472 public:
1473   /// The loop that we evaluate.
1474   Loop *TheLoop;
1475 
1476   /// Predicated scalar evolution analysis.
1477   PredicatedScalarEvolution &PSE;
1478 
1479   /// Loop Info analysis.
1480   LoopInfo *LI;
1481 
1482   /// Vectorization legality.
1483   LoopVectorizationLegality *Legal;
1484 
1485   /// Vector target information.
1486   const TargetTransformInfo &TTI;
1487 
1488   /// Target Library Info.
1489   const TargetLibraryInfo *TLI;
1490 
1491   /// Demanded bits analysis.
1492   DemandedBits *DB;
1493 
1494   /// Assumption cache.
1495   AssumptionCache *AC;
1496 
1497   /// Interface to emit optimization remarks.
1498   OptimizationRemarkEmitter *ORE;
1499 
1500   const Function *TheFunction;
1501 
1502   /// Loop Vectorize Hint.
1503   const LoopVectorizeHints *Hints;
1504 
  /// The interleaved access information contains groups of interleaved
  /// accesses that have the same stride and are close to each other.
1507   InterleavedAccessInfo &InterleaveInfo;
1508 
1509   /// Values to ignore in the cost model.
1510   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1511 
1512   /// Values to ignore in the cost model when VF > 1.
1513   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1514 };
1515 
1516 } // end namespace llvm
1517 
1518 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1519 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1521 // vector length information is not provided, vectorization is not considered
1522 // explicit. Interleave hints are not allowed either. These limitations will be
1523 // relaxed in the future.
// Please note that we are currently forced to abuse the semantics of pragma
// 'clang loop vectorize'. This pragma provides *auto-vectorization hints*
1526 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1527 // provides *explicit vectorization hints* (LV can bypass legal checks and
1528 // assume that vectorization is legal). However, both hints are implemented
1529 // using the same metadata (llvm.loop.vectorize, processed by
1530 // LoopVectorizeHints). This will be fixed in the future when the native IR
1531 // representation for pragma 'omp simd' is introduced.
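// For example (illustrative), an outer loop that this function accepts:
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];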
1532 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1533                                    OptimizationRemarkEmitter *ORE) {
1534   assert(!OuterLp->empty() && "This is not an outer loop");
1535   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1536 
1537   // Only outer loops with an explicit vectorization hint are supported.
1538   // Unannotated outer loops are ignored.
1539   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1540     return false;
1541 
1542   Function *Fn = OuterLp->getHeader()->getParent();
1543   if (!Hints.allowVectorization(Fn, OuterLp,
1544                                 true /*VectorizeOnlyWhenForced*/)) {
1545     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1546     return false;
1547   }
1548 
1549   if (Hints.getInterleave() > 1) {
1550     // TODO: Interleave support is future work.
1551     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1552                          "outer loops.\n");
1553     Hints.emitRemarkWithHints();
1554     return false;
1555   }
1556 
1557   return true;
1558 }
1559 
1560 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1561                                   OptimizationRemarkEmitter *ORE,
1562                                   SmallVectorImpl<Loop *> &V) {
1563   // Collect inner loops and outer loops without irreducible control flow. For
1564   // now, only collect outer loops that have explicit vectorization hints. If we
1565   // are stress testing the VPlan H-CFG construction, we collect the outermost
1566   // loop of every loop nest.
1567   if (L.empty() || VPlanBuildStressTest ||
1568       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1569     LoopBlocksRPO RPOT(&L);
1570     RPOT.perform(LI);
1571     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1572       V.push_back(&L);
1573       // TODO: Collect inner loops inside marked outer loops in case
1574       // vectorization fails for the outer loop. Do not invoke
1575       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1576       // already known to be reducible. We can use an inherited attribute for
1577       // that.
1578       return;
1579     }
1580   }
1581   for (Loop *InnerL : L)
1582     collectSupportedLoops(*InnerL, LI, ORE, V);
1583 }
1584 
1585 namespace {
1586 
1587 /// The LoopVectorize Pass.
1588 struct LoopVectorize : public FunctionPass {
1589   /// Pass identification, replacement for typeid
1590   static char ID;
1591 
1592   LoopVectorizePass Impl;
1593 
1594   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1595                          bool VectorizeOnlyWhenForced = false)
1596       : FunctionPass(ID) {
1597     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1598     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1599     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1600   }
1601 
1602   bool runOnFunction(Function &F) override {
1603     if (skipFunction(F))
1604       return false;
1605 
1606     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1607     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1608     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1609     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1610     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1611     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1612     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1613     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1614     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1615     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1616     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1617     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1618     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1619 
1620     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1621         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1622 
1623     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1624                         GetLAA, *ORE, PSI);
1625   }
1626 
1627   void getAnalysisUsage(AnalysisUsage &AU) const override {
1628     AU.addRequired<AssumptionCacheTracker>();
1629     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1630     AU.addRequired<DominatorTreeWrapperPass>();
1631     AU.addRequired<LoopInfoWrapperPass>();
1632     AU.addRequired<ScalarEvolutionWrapperPass>();
1633     AU.addRequired<TargetTransformInfoWrapperPass>();
1634     AU.addRequired<AAResultsWrapperPass>();
1635     AU.addRequired<LoopAccessLegacyAnalysis>();
1636     AU.addRequired<DemandedBitsWrapperPass>();
1637     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1638 
1639     // We currently do not preserve loopinfo/dominator analyses with outer loop
1640     // vectorization. Until this is addressed, mark these analyses as preserved
1641     // only for non-VPlan-native path.
1642     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1643     if (!EnableVPlanNativePath) {
1644       AU.addPreserved<LoopInfoWrapperPass>();
1645       AU.addPreserved<DominatorTreeWrapperPass>();
1646     }
1647 
1648     AU.addPreserved<BasicAAWrapperPass>();
1649     AU.addPreserved<GlobalsAAWrapperPass>();
1650     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1651   }
1652 };
1653 
1654 } // end anonymous namespace
1655 
1656 //===----------------------------------------------------------------------===//
1657 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1658 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1659 //===----------------------------------------------------------------------===//
1660 
1661 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1665   Instruction *Instr = dyn_cast<Instruction>(V);
1666   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1667                      (!Instr ||
1668                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1669   // Place the code for broadcasting invariant variables in the new preheader.
1670   IRBuilder<>::InsertPointGuard Guard(Builder);
1671   if (SafeToHoist)
1672     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1673 
1674   // Broadcast the scalar into all locations in the vector.
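  // For VF = 4 this yields roughly (illustrative):
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> undef,
  //                                    <4 x i32> zeroinitializer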
1675   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1676 
1677   return Shuf;
1678 }
1679 
1680 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1681     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1682   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1683          "Expected either an induction phi-node or a truncate of it!");
1684   Value *Start = II.getStartValue();
1685 
  // Construct the initial value of the vector IV in the vector loop
  // preheader.
1687   auto CurrIP = Builder.saveIP();
1688   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1689   if (isa<TruncInst>(EntryVal)) {
1690     assert(Start->getType()->isIntegerTy() &&
1691            "Truncation requires an integer type");
1692     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1693     Step = Builder.CreateTrunc(Step, TruncType);
1694     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1695   }
1696   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1697   Value *SteppedStart =
1698       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1699 
1700   // We create vector phi nodes for both integer and floating-point induction
1701   // variables. Here, we determine the kind of arithmetic we will perform.
1702   Instruction::BinaryOps AddOp;
1703   Instruction::BinaryOps MulOp;
1704   if (Step->getType()->isIntegerTy()) {
1705     AddOp = Instruction::Add;
1706     MulOp = Instruction::Mul;
1707   } else {
1708     AddOp = II.getInductionOpcode();
1709     MulOp = Instruction::FMul;
1710   }
1711 
1712   // Multiply the vectorization factor by the step using integer or
1713   // floating-point arithmetic as appropriate.
1714   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1715   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1716 
1717   // Create a vector splat to use in the induction update.
1718   //
1719   // FIXME: If the step is non-constant, we create the vector splat with
1720   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1721   //        handle a constant vector splat.
1722   Value *SplatVF = isa<Constant>(Mul)
1723                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1724                        : Builder.CreateVectorSplat(VF, Mul);
1725   Builder.restoreIP(CurrIP);
1726 
1727   // We may need to add the step a number of times, depending on the unroll
1728   // factor. The last of those goes into the PHI.
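  // For example (illustrative), for an integer IV starting at 0 with a unit
  // step, VF = 4 and UF = 2, the resulting IR looks roughly like:
  //   %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
  //                            [ %vec.ind.next, %vector.body ]
  //   %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  //   ...
  //   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>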
1729   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1730                                     &*LoopVectorBody->getFirstInsertionPt());
1731   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1732   Instruction *LastInduction = VecInd;
1733   for (unsigned Part = 0; Part < UF; ++Part) {
1734     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1735 
1736     if (isa<TruncInst>(EntryVal))
1737       addMetadata(LastInduction, EntryVal);
1738     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1739 
1740     LastInduction = cast<Instruction>(addFastMathFlag(
1741         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1742     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1743   }
1744 
1745   // Move the last step to the end of the latch block. This ensures consistent
1746   // placement of all induction updates.
1747   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1748   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1749   auto *ICmp = cast<Instruction>(Br->getCondition());
1750   LastInduction->moveBefore(ICmp);
1751   LastInduction->setName("vec.ind.next");
1752 
1753   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1754   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1755 }
1756 
1757 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1758   return Cost->isScalarAfterVectorization(I, VF) ||
1759          Cost->isProfitableToScalarize(I, VF);
1760 }
1761 
1762 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1763   if (shouldScalarizeInstruction(IV))
1764     return true;
1765   auto isScalarInst = [&](User *U) -> bool {
1766     auto *I = cast<Instruction>(U);
1767     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1768   };
1769   return llvm::any_of(IV->users(), isScalarInst);
1770 }
1771 
1772 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1773     const InductionDescriptor &ID, const Instruction *EntryVal,
1774     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1775   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1776          "Expected either an induction phi-node or a truncate of it!");
1777 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't have
  // to do any recording in this case - that is done when the original IV is
  // processed.
1784   if (isa<TruncInst>(EntryVal))
1785     return;
1786 
1787   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1788   if (Casts.empty())
1789     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The remaining Casts (if any) have no uses outside the induction update
  // chain itself.
1793   Instruction *CastInst = *Casts.begin();
1794   if (Lane < UINT_MAX)
1795     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1796   else
1797     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1798 }
1799 
1800 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1801   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1802          "Primary induction variable must have an integer type");
1803 
1804   auto II = Legal->getInductionVars()->find(IV);
1805   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1806 
1807   auto ID = II->second;
1808   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1809 
1810   // The scalar value to broadcast. This will be derived from the canonical
1811   // induction variable.
1812   Value *ScalarIV = nullptr;
1813 
1814   // The value from the original loop to which we are mapping the new induction
1815   // variable.
1816   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1817 
1818   // True if we have vectorized the induction variable.
1819   auto VectorizedIV = false;
1820 
1821   // Determine if we want a scalar version of the induction variable. This is
1822   // true if the induction variable itself is not widened, or if it has at
1823   // least one user in the loop that is not widened.
1824   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1825 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1828   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1829          "Induction step should be loop invariant");
1830   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1831   Value *Step = nullptr;
1832   if (PSE.getSE()->isSCEVable(IV->getType())) {
1833     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1834     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1835                              LoopVectorPreHeader->getTerminator());
1836   } else {
1837     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1838   }
1839 
1840   // Try to create a new independent vector induction variable. If we can't
1841   // create the phi node, we will splat the scalar induction variable in each
1842   // loop iteration.
1843   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1844     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1845     VectorizedIV = true;
1846   }
1847 
1848   // If we haven't yet vectorized the induction variable, or if we will create
1849   // a scalar one, we need to define the scalar induction variable and step
1850   // values. If we were given a truncation type, truncate the canonical
1851   // induction variable and step. Otherwise, derive these values from the
1852   // induction descriptor.
1853   if (!VectorizedIV || NeedsScalarIV) {
1854     ScalarIV = Induction;
1855     if (IV != OldInduction) {
1856       ScalarIV = IV->getType()->isIntegerTy()
1857                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1858                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1859                                           IV->getType());
1860       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1861       ScalarIV->setName("offset.idx");
1862     }
1863     if (Trunc) {
1864       auto *TruncType = cast<IntegerType>(Trunc->getType());
1865       assert(Step->getType()->isIntegerTy() &&
1866              "Truncation requires an integer step");
1867       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1868       Step = Builder.CreateTrunc(Step, TruncType);
1869     }
1870   }
1871 
1872   // If we haven't yet vectorized the induction variable, splat the scalar
1873   // induction variable, and build the necessary step vectors.
1874   // TODO: Don't do it unless the vectorized IV is really required.
1875   if (!VectorizedIV) {
1876     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1877     for (unsigned Part = 0; Part < UF; ++Part) {
1878       Value *EntryPart =
1879           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1880       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1881       if (Trunc)
1882         addMetadata(EntryPart, Trunc);
1883       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1884     }
1885   }
1886 
1887   // If an induction variable is only used for counting loop iterations or
1888   // calculating addresses, it doesn't need to be widened. Create scalar steps
1889   // that can be used by instructions we will later scalarize. Note that the
1890   // addition of the scalar steps will not increase the number of instructions
1891   // in the loop in the common case prior to InstCombine. We will be trading
1892   // one vector extract for each scalar step.
1893   if (NeedsScalarIV)
1894     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1895 }
1896 
1897 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1898                                           Instruction::BinaryOps BinOp) {
1899   // Create and check the types.
1900   assert(Val->getType()->isVectorTy() && "Must be a vector");
1901   int VLen = Val->getType()->getVectorNumElements();
1902 
1903   Type *STy = Val->getType()->getScalarType();
1904   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1905          "Induction Step must be an integer or FP");
1906   assert(Step->getType() == STy && "Step has wrong type");
1907 
1908   SmallVector<Constant *, 8> Indices;
1909 
1910   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
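    // For example (illustrative), with StartIdx = 4 and VF = 4 this is
    // <4, 5, 6, 7>; with a unit step the value returned below is then
    // Val + <4, 5, 6, 7>, i.e. the second unrolled part of the induction.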
1912     for (int i = 0; i < VLen; ++i)
1913       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1914 
1915     // Add the consecutive indices to the vector value.
1916     Constant *Cv = ConstantVector::get(Indices);
1917     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1918     Step = Builder.CreateVectorSplat(VLen, Step);
1919     assert(Step->getType() == Val->getType() && "Invalid step vec");
1920     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1921     // which can be found from the original scalar operations.
1922     Step = Builder.CreateMul(Cv, Step);
1923     return Builder.CreateAdd(Val, Step, "induction");
1924   }
1925 
1926   // Floating point induction.
1927   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1928          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
1930   for (int i = 0; i < VLen; ++i)
1931     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1932 
1933   // Add the consecutive indices to the vector value.
1934   Constant *Cv = ConstantVector::get(Indices);
1935 
1936   Step = Builder.CreateVectorSplat(VLen, Step);
1937 
1938   // Floating point operations had to be 'fast' to enable the induction.
1939   FastMathFlags Flags;
1940   Flags.setFast();
1941 
1942   Value *MulOp = Builder.CreateFMul(Cv, Step);
1943   if (isa<Instruction>(MulOp))
    // Have to check: MulOp may have been folded to a constant.
1945     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1946 
1947   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1948   if (isa<Instruction>(BOp))
1949     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1950   return BOp;
1951 }
1952 
1953 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1954                                            Instruction *EntryVal,
1955                                            const InductionDescriptor &ID) {
1956   // We shouldn't have to build scalar steps if we aren't vectorizing.
1957   assert(VF > 1 && "VF should be greater than one");
1958 
  // Get the value type and ensure it and the step have the same type.
1960   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1961   assert(ScalarIVTy == Step->getType() &&
1962          "Val and Step should have the same type");
1963 
1964   // We build scalar steps for both integer and floating-point induction
1965   // variables. Here, we determine the kind of arithmetic we will perform.
1966   Instruction::BinaryOps AddOp;
1967   Instruction::BinaryOps MulOp;
1968   if (ScalarIVTy->isIntegerTy()) {
1969     AddOp = Instruction::Add;
1970     MulOp = Instruction::Mul;
1971   } else {
1972     AddOp = ID.getInductionOpcode();
1973     MulOp = Instruction::FMul;
1974   }
1975 
1976   // Determine the number of scalars we need to generate for each unroll
1977   // iteration. If EntryVal is uniform, we only need to generate the first
1978   // lane. Otherwise, we generate all VF values.
1979   unsigned Lanes =
1980       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1981                                                                          : VF;
1982   // Compute the scalar steps and save the results in VectorLoopValueMap.
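  // For example (illustrative), with VF = 4, UF = 1 and a unit step, the
  // scalar steps are ScalarIV + 0, + 1, + 2 and + 3 (only ScalarIV + 0 if
  // EntryVal is uniform after vectorization).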
1983   for (unsigned Part = 0; Part < UF; ++Part) {
1984     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1985       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1986       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1987       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1988       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1989       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1990     }
1991   }
1992 }
1993 
1994 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1995   assert(V != Induction && "The new induction variable should not be used.");
1996   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1997   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1998 
1999   // If we have a stride that is replaced by one, do it here. Defer this for
2000   // the VPlan-native path until we start running Legal checks in that path.
2001   if (!EnableVPlanNativePath && Legal->hasStride(V))
2002     V = ConstantInt::get(V->getType(), 1);
2003 
2004   // If we have a vector mapped to this value, return it.
2005   if (VectorLoopValueMap.hasVectorValue(V, Part))
2006     return VectorLoopValueMap.getVectorValue(V, Part);
2007 
2008   // If the value has not been vectorized, check if it has been scalarized
2009   // instead. If it has been scalarized, and we actually need the value in
2010   // vector form, we will construct the vector values on demand.
2011   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2012     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2013 
2014     // If we've scalarized a value, that value should be an instruction.
2015     auto *I = cast<Instruction>(V);
2016 
2017     // If we aren't vectorizing, we can just copy the scalar map values over to
2018     // the vector map.
2019     if (VF == 1) {
2020       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2021       return ScalarValue;
2022     }
2023 
2024     // Get the last scalar instruction we generated for V and Part. If the value
2025     // is known to be uniform after vectorization, this corresponds to lane zero
2026     // of the Part unroll iteration. Otherwise, the last instruction is the one
2027     // we created for the last vector lane of the Part unroll iteration.
2028     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2029     auto *LastInst = cast<Instruction>(
2030         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2031 
2032     // Set the insert point after the last scalarized instruction. This ensures
2033     // the insertelement sequence will directly follow the scalar definitions.
2034     auto OldIP = Builder.saveIP();
2035     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2036     Builder.SetInsertPoint(&*NewIP);
2037 
2038     // However, if we are vectorizing, we need to construct the vector values.
2039     // If the value is known to be uniform after vectorization, we can just
2040     // broadcast the scalar value corresponding to lane zero for each unroll
2041     // iteration. Otherwise, we construct the vector values using insertelement
2042     // instructions. Since the resulting vectors are stored in
2043     // VectorLoopValueMap, we will only generate the insertelements once.
2044     Value *VectorValue = nullptr;
2045     if (Cost->isUniformAfterVectorization(I, VF)) {
2046       VectorValue = getBroadcastInstrs(ScalarValue);
2047       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2048     } else {
2049       // Initialize packing with insertelements to start from undef.
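      // For example (illustrative), for VF = 4 this emits one insertelement
      // per lane, roughly:
      //   %pack.0 = insertelement <4 x i32> undef,   i32 %s0, i32 0
      //   %pack.1 = insertelement <4 x i32> %pack.0, i32 %s1, i32 1
      //   ...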
2050       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2051       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2052       for (unsigned Lane = 0; Lane < VF; ++Lane)
2053         packScalarIntoVectorValue(V, {Part, Lane});
2054       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2055     }
2056     Builder.restoreIP(OldIP);
2057     return VectorValue;
2058   }
2059 
2060   // If this scalar is unknown, assume that it is a constant or that it is
2061   // loop invariant. Broadcast V and save the value for future uses.
2062   Value *B = getBroadcastInstrs(V);
2063   VectorLoopValueMap.setVectorValue(V, Part, B);
2064   return B;
2065 }
2066 
2067 Value *
2068 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2069                                             const VPIteration &Instance) {
2070   // If the value is not an instruction contained in the loop, it should
2071   // already be scalar.
2072   if (OrigLoop->isLoopInvariant(V))
2073     return V;
2074 
2075   assert(Instance.Lane > 0
2076              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2077              : true && "Uniform values only have lane zero");
2078 
2079   // If the value from the original loop has not been vectorized, it is
2080   // represented by UF x VF scalar values in the new loop. Return the requested
2081   // scalar value.
2082   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2083     return VectorLoopValueMap.getScalarValue(V, Instance);
2084 
2085   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2086   // for the given unroll part. If this entry is not a vector type (i.e., the
2087   // vectorization factor is one), there is no need to generate an
2088   // extractelement instruction.
2089   auto *U = getOrCreateVectorValue(V, Instance.Part);
2090   if (!U->getType()->isVectorTy()) {
2091     assert(VF == 1 && "Value not scalarized has non-vector type");
2092     return U;
2093   }
2094 
2095   // Otherwise, the value from the original loop has been vectorized and is
2096   // represented by UF vector values. Extract and return the requested scalar
2097   // value from the appropriate vector lane.
2098   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2099 }
2100 
2101 void InnerLoopVectorizer::packScalarIntoVectorValue(
2102     Value *V, const VPIteration &Instance) {
2103   assert(V != Induction && "The new induction variable should not be used.");
2104   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2105   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2106 
2107   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2108   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2109   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2110                                             Builder.getInt32(Instance.Lane));
2111   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2112 }
2113 
2114 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2115   assert(Vec->getType()->isVectorTy() && "Invalid type");
2116   SmallVector<Constant *, 8> ShuffleMask;
2117   for (unsigned i = 0; i < VF; ++i)
2118     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
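  // For example (illustrative), for VF = 4 the shuffle mask built above is
  // <3, 2, 1, 0>.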
2119 
2120   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2121                                      ConstantVector::get(ShuffleMask),
2122                                      "reverse");
2123 }
2124 
2125 // Return whether we allow using masked interleave-groups (for dealing with
2126 // strided loads/stores that reside in predicated blocks, or for dealing
2127 // with gaps).
2128 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2129   // If an override option has been passed in for interleaved accesses, use it.
2130   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2131     return EnableMaskedInterleavedMemAccesses;
2132 
2133   return TTI.enableMaskedInterleavedAccessVectorization();
2134 }
2135 
2136 // Try to vectorize the interleave group that \p Instr belongs to.
2137 //
2138 // E.g. Translate following interleaved load group (factor = 3):
2139 //   for (i = 0; i < N; i+=3) {
2140 //     R = Pic[i];             // Member of index 0
2141 //     G = Pic[i+1];           // Member of index 1
2142 //     B = Pic[i+2];           // Member of index 2
2143 //     ... // do something to R, G, B
2144 //   }
2145 // To:
2146 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2147 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2148 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2149 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2150 //
2151 // Or translate following interleaved store group (factor = 3):
2152 //   for (i = 0; i < N; i+=3) {
2153 //     ... do something to R, G, B
2154 //     Pic[i]   = R;           // Member of index 0
2155 //     Pic[i+1] = G;           // Member of index 1
2156 //     Pic[i+2] = B;           // Member of index 2
2157 //   }
2158 // To:
2159 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2160 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2161 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2162 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2163 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2164 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2165                                                    VectorParts *BlockInMask) {
2166   const InterleaveGroup<Instruction> *Group =
2167       Cost->getInterleavedAccessGroup(Instr);
2168   assert(Group && "Fail to get an interleaved access group.");
2169 
2170   // Skip if current instruction is not the insert position.
2171   if (Instr != Group->getInsertPos())
2172     return;
2173 
2174   const DataLayout &DL = Instr->getModule()->getDataLayout();
2175   Value *Ptr = getLoadStorePointerOperand(Instr);
2176 
2177   // Prepare for the vector type of the interleaved load/store.
2178   Type *ScalarTy = getMemInstValueType(Instr);
2179   unsigned InterleaveFactor = Group->getFactor();
2180   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2181   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2182 
2183   // Prepare for the new pointers.
2184   setDebugLocFromInst(Builder, Ptr);
2185   SmallVector<Value *, 2> NewPtrs;
2186   unsigned Index = Group->getIndex(Instr);
2187 
2188   VectorParts Mask;
2189   bool IsMaskForCondRequired = BlockInMask;
2190   if (IsMaskForCondRequired) {
2191     Mask = *BlockInMask;
2192     // TODO: extend the masked interleaved-group support to reversed access.
2193     assert(!Group->isReverse() && "Reversed masked interleave-group "
2194                                   "not supported.");
2195   }
2196 
2197   // If the group is reverse, adjust the index to refer to the last vector lane
2198   // instead of the first. We adjust the index from the first vector lane,
2199   // rather than directly getting the pointer for lane VF - 1, because the
2200   // pointer operand of the interleaved access is supposed to be uniform. For
2201   // uniform instructions, we're only required to generate a value for the
2202   // first vector lane in each unroll iteration.
2203   if (Group->isReverse())
2204     Index += (VF - 1) * Group->getFactor();
2205 
2206   bool InBounds = false;
2207   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2208     InBounds = gep->isInBounds();
2209 
2210   for (unsigned Part = 0; Part < UF; Part++) {
2211     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2212 
    // Note that the current instruction could be at any member index, so we
    // need to adjust the address to that of the member at index 0.
2215     //
2216     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2217     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2219     //
2220     // E.g.  A[i+1] = a;     // Member of index 1
2221     //       A[i]   = b;     // Member of index 0
2222     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2224     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2225     if (InBounds)
2226       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2227 
2228     // Cast to the vector pointer type.
2229     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2230   }
2231 
2232   setDebugLocFromInst(Builder, Instr);
2233   Value *UndefVec = UndefValue::get(VecTy);
2234 
2235   Value *MaskForGaps = nullptr;
2236   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2237     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2238     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2239   }
2240 
2241   // Vectorize the interleaved load group.
2242   if (isa<LoadInst>(Instr)) {
2243     // For each unroll part, create a wide load for the group.
2244     SmallVector<Value *, 2> NewLoads;
2245     for (unsigned Part = 0; Part < UF; Part++) {
2246       Instruction *NewLoad;
2247       if (IsMaskForCondRequired || MaskForGaps) {
2248         assert(useMaskedInterleavedAccesses(*TTI) &&
2249                "masked interleaved groups are not allowed.");
2250         Value *GroupMask = MaskForGaps;
2251         if (IsMaskForCondRequired) {
2252           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2253           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2254           Value *ShuffledMask = Builder.CreateShuffleVector(
2255               Mask[Part], Undefs, RepMask, "interleaved.mask");
2256           GroupMask = MaskForGaps
2257                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2258                                                 MaskForGaps)
2259                           : ShuffledMask;
2260         }
2261         NewLoad =
2262             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2263                                      GroupMask, UndefVec, "wide.masked.vec");
2264       }
2265       else
2266         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2267                                             Group->getAlignment(), "wide.vec");
2268       Group->addMetadata(NewLoad);
2269       NewLoads.push_back(NewLoad);
2270     }
2271 
2272     // For each member in the group, shuffle out the appropriate data from the
2273     // wide loads.
2274     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2275       Instruction *Member = Group->getMember(I);
2276 
2277       // Skip the gaps in the group.
2278       if (!Member)
2279         continue;
2280 
2281       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2282       for (unsigned Part = 0; Part < UF; Part++) {
2283         Value *StridedVec = Builder.CreateShuffleVector(
2284             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2285 
        // If this member has a different type, cast the result to that type.
2287         if (Member->getType() != ScalarTy) {
2288           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2289           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2290         }
2291 
2292         if (Group->isReverse())
2293           StridedVec = reverseVector(StridedVec);
2294 
2295         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2296       }
2297     }
2298     return;
2299   }
2300 
  // The sub-vector type for the current instruction.
2302   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2303 
2304   // Vectorize the interleaved store group.
2305   for (unsigned Part = 0; Part < UF; Part++) {
2306     // Collect the stored vector from each member.
2307     SmallVector<Value *, 4> StoredVecs;
2308     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2310       Instruction *Member = Group->getMember(i);
2311       assert(Member && "Fail to get a member from an interleaved store group");
2312 
2313       Value *StoredVec = getOrCreateVectorValue(
2314           cast<StoreInst>(Member)->getValueOperand(), Part);
2315       if (Group->isReverse())
2316         StoredVec = reverseVector(StoredVec);
2317 
      // If this member has a different type, cast it to a unified type.
2320       if (StoredVec->getType() != SubVT)
2321         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2322 
2323       StoredVecs.push_back(StoredVec);
2324     }
2325 
2326     // Concatenate all vectors into a wide vector.
2327     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2328 
2329     // Interleave the elements in the wide vector.
2330     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2331     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2332                                               "interleaved.vec");
2333 
2334     Instruction *NewStoreInstr;
2335     if (IsMaskForCondRequired) {
2336       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2337       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2338       Value *ShuffledMask = Builder.CreateShuffleVector(
2339           Mask[Part], Undefs, RepMask, "interleaved.mask");
2340       NewStoreInstr = Builder.CreateMaskedStore(
2341           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2342     }
2343     else
2344       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2345         Group->getAlignment());
2346 
2347     Group->addMetadata(NewStoreInstr);
2348   }
2349 }
2350 
2351 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2352                                                      VectorParts *BlockInMask) {
  // Attempt to issue a wide load or store.
2354   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2355   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2356 
2357   assert((LI || SI) && "Invalid Load/Store instruction");
2358 
2359   LoopVectorizationCostModel::InstWidening Decision =
2360       Cost->getWideningDecision(Instr, VF);
2361   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2362          "CM decision should be taken at this point");
2363   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2364     return vectorizeInterleaveGroup(Instr);
2365 
2366   Type *ScalarDataTy = getMemInstValueType(Instr);
2367   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2368   Value *Ptr = getLoadStorePointerOperand(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2371   const DataLayout &DL = Instr->getModule()->getDataLayout();
2372   const Align Alignment =
2373       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2374   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2375 
2376   // Determine if the pointer operand of the access is either consecutive or
2377   // reverse consecutive.
2378   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2379   bool ConsecutiveStride =
2380       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2381   bool CreateGatherScatter =
2382       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2383 
2384   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2385   // gather/scatter. Otherwise Decision should have been to Scalarize.
2386   assert((ConsecutiveStride || CreateGatherScatter) &&
2387          "The instruction should be scalarized");
2388 
2389   // Handle consecutive loads/stores.
2390   if (ConsecutiveStride)
2391     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2392 
2393   VectorParts Mask;
2394   bool isMaskRequired = BlockInMask;
2395   if (isMaskRequired)
2396     Mask = *BlockInMask;
2397 
2398   bool InBounds = false;
2399   if (auto *gep = dyn_cast<GetElementPtrInst>(
2400           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2401     InBounds = gep->isInBounds();
2402 
2403   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2404     // Calculate the pointer for the specific unroll-part.
2405     GetElementPtrInst *PartPtr = nullptr;
2406 
2407     if (Reverse) {
2408       // If the address is consecutive but reversed, then the
2409       // wide store needs to start at the last vector element.
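      // For example (illustrative), with VF = 4 and Part = 1 the two GEPs
      // below step back by 4 and then by 3 more elements, yielding the lowest
      // address of the four elements accessed by this part; the vector value
      // itself is reversed separately.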
2410       PartPtr = cast<GetElementPtrInst>(
2411           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2412       PartPtr->setIsInBounds(InBounds);
2413       PartPtr = cast<GetElementPtrInst>(
2414           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2415       PartPtr->setIsInBounds(InBounds);
2416       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2417         Mask[Part] = reverseVector(Mask[Part]);
2418     } else {
2419       PartPtr = cast<GetElementPtrInst>(
2420           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2421       PartPtr->setIsInBounds(InBounds);
2422     }
2423 
2424     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2425   };
2426 
2427   // Handle Stores:
2428   if (SI) {
2429     setDebugLocFromInst(Builder, SI);
2430 
2431     for (unsigned Part = 0; Part < UF; ++Part) {
2432       Instruction *NewSI = nullptr;
2433       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2434       if (CreateGatherScatter) {
2435         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2436         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2437         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2438                                             Alignment.value(), MaskPart);
2439       } else {
2440         if (Reverse) {
2441           // If we store to reverse consecutive memory locations, then we need
2442           // to reverse the order of elements in the stored value.
2443           StoredVal = reverseVector(StoredVal);
2444           // We don't want to update the value in the map as it might be used in
2445           // another expression. So don't call resetVectorValue(StoredVal).
2446         }
2447         auto *VecPtr = CreateVecPtr(Part, Ptr);
2448         if (isMaskRequired)
2449           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
2450                                             Alignment.value(), Mask[Part]);
2451         else
2452           NewSI =
2453               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2454       }
2455       addMetadata(NewSI, SI);
2456     }
2457     return;
2458   }
2459 
2460   // Handle loads.
2461   assert(LI && "Must have a load instruction");
2462   setDebugLocFromInst(Builder, LI);
2463   for (unsigned Part = 0; Part < UF; ++Part) {
2464     Value *NewLI;
2465     if (CreateGatherScatter) {
2466       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2467       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2468       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2469                                          nullptr, "wide.masked.gather");
2470       addMetadata(NewLI, LI);
2471     } else {
2472       auto *VecPtr = CreateVecPtr(Part, Ptr);
2473       if (isMaskRequired)
2474         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
2475                                          UndefValue::get(DataTy),
2476                                          "wide.masked.load");
2477       else
2478         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2479                                           "wide.load");
2480 
2481       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2482       addMetadata(NewLI, LI);
2483       if (Reverse)
2484         NewLI = reverseVector(NewLI);
2485     }
2486     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2487   }
2488 }
2489 
2490 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2491                                                const VPIteration &Instance,
2492                                                bool IfPredicateInstr) {
2493   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2494 
2495   setDebugLocFromInst(Builder, Instr);
2496 
  // Does this instruction return a value?
2498   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2499 
2500   Instruction *Cloned = Instr->clone();
2501   if (!IsVoidRetTy)
2502     Cloned->setName(Instr->getName() + ".cloned");
2503 
2504   // Replace the operands of the cloned instructions with their scalar
2505   // equivalents in the new loop.
2506   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2507     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2508     Cloned->setOperand(op, NewOp);
2509   }
2510   addNewMetadata(Cloned, Instr);
2511 
2512   // Place the cloned scalar in the new loop.
2513   Builder.Insert(Cloned);
2514 
2515   // Add the cloned scalar to the scalar map entry.
2516   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2517 
  // If we just cloned a new assumption, add it to the assumption cache.
2519   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2520     if (II->getIntrinsicID() == Intrinsic::assume)
2521       AC->registerAssumption(II);
2522 
2523   // End if-block.
2524   if (IfPredicateInstr)
2525     PredicatedInstructions.push_back(Cloned);
2526 }
2527 
2528 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2529                                                       Value *End, Value *Step,
2530                                                       Instruction *DL) {
2531   BasicBlock *Header = L->getHeader();
2532   BasicBlock *Latch = L->getLoopLatch();
2533   // As we're just creating this loop, it's possible no latch exists
2534   // yet. If so, use the header as this will be a single block loop.
2535   if (!Latch)
2536     Latch = Header;
2537 
2538   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2539   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2540   setDebugLocFromInst(Builder, OldInst);
2541   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2542 
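  // Build the induction increment and the exit condition at the loop latch.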
2543   Builder.SetInsertPoint(Latch->getTerminator());
2544   setDebugLocFromInst(Builder, OldInst);
2545 
2546   // Create i+1 and fill the PHINode.
2547   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2548   Induction->addIncoming(Start, L->getLoopPreheader());
2549   Induction->addIncoming(Next, Latch);
2550   // Create the compare.
2551   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2552   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2553 
2554   // Now we have two terminators. Remove the old one from the block.
2555   Latch->getTerminator()->eraseFromParent();
2556 
2557   return Induction;
2558 }
2559 
2560 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2561   if (TripCount)
2562     return TripCount;
2563 
2564   assert(L && "Create Trip Count for null loop.");
2565   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2566   // Find the loop boundaries.
2567   ScalarEvolution *SE = PSE.getSE();
2568   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2569   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2570          "Invalid loop count");
2571 
2572   Type *IdxTy = Legal->getWidestInductionType();
2573   assert(IdxTy && "No type for induction");
2574 
  // The exit count might have the type i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we get a backedge-taken count in that case is if the
  // induction variable was signed and as such will not overflow, so truncation
  // is legal.
2580   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2581       IdxTy->getPrimitiveSizeInBits())
2582     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2583   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2584 
2585   // Get the total trip count from the count by adding 1.
2586   const SCEV *ExitCount = SE->getAddExpr(
2587       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2588 
2589   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2590 
2591   // Expand the trip count and place the new instructions in the preheader.
2592   // Notice that the pre-header does not change, only the loop body.
2593   SCEVExpander Exp(*SE, DL, "induction");
2594 
2595   // Count holds the overall loop count (N).
2596   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2597                                 L->getLoopPreheader()->getTerminator());
2598 
2599   if (TripCount->getType()->isPointerTy())
2600     TripCount =
2601         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2602                                     L->getLoopPreheader()->getTerminator());
2603 
2604   return TripCount;
2605 }
2606 
2607 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2608   if (VectorTripCount)
2609     return VectorTripCount;
2610 
2611   Value *TC = getOrCreateTripCount(L);
2612   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2613 
2614   Type *Ty = TC->getType();
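  // Each vector-loop iteration consumes VF * UF scalar iterations (SIMD width
  // times unroll factor).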
2615   Constant *Step = ConstantInt::get(Ty, VF * UF);
2616 
2617   // If the tail is to be folded by masking, round the number of iterations N
2618   // up to a multiple of Step instead of rounding down. This is done by first
2619   // adding Step-1 and then rounding down. Note that it's ok if this addition
2620   // overflows: the vector induction variable will eventually wrap to zero given
2621   // that it starts at zero and its Step is a power of two; the loop will then
2622   // exit, with the last early-exit vector comparison also producing all-true.
2623   if (Cost->foldTailByMasking()) {
2624     assert(isPowerOf2_32(VF * UF) &&
2625            "VF*UF must be a power of 2 when folding tail by masking");
2626     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2627   }
2628 
2629   // Now we need to generate the expression for the part of the loop that the
2630   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2631   // iterations are not required for correctness, or N - Step, otherwise. Step
2632   // is equal to the vectorization factor (number of SIMD elements) times the
2633   // unroll factor (number of SIMD instructions).
2634   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2635 
2636   // If there is a non-reversed interleaved group that may speculatively access
2637   // memory out-of-bounds, we need to ensure that there will be at least one
2638   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2639   // the trip count, we set the remainder to be equal to the step. If the step
2640   // does not evenly divide the trip count, no adjustment is necessary since
2641   // there will already be scalar iterations. Note that the minimum iterations
2642   // check ensures that N >= Step.
2643   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2644     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2645     R = Builder.CreateSelect(IsZero, Step, R);
2646   }
2647 
2648   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2649 
2650   return VectorTripCount;
2651 }
2652 
2653 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2654                                                    const DataLayout &DL) {
2655   // Verify that V is a vector type with same number of elements as DstVTy.
2656   unsigned VF = DstVTy->getNumElements();
2657   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2658   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2659   Type *SrcElemTy = SrcVecTy->getElementType();
2660   Type *DstElemTy = DstVTy->getElementType();
2661   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2662          "Vector elements must have same size");
2663 
2664   // Do a direct cast if element types are castable.
2665   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2666     return Builder.CreateBitOrPointerCast(V, DstVTy);
2667   }
  // V cannot be directly cast to the desired vector type. This may happen when
  // V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
2672   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2673          "Only one type should be a pointer type");
2674   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2675          "Only one type should be a floating point type");
2676   Type *IntTy =
2677       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2678   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2679   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2680   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2681 }
2682 
2683 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2684                                                          BasicBlock *Bypass) {
2685   Value *Count = getOrCreateTripCount(L);
2686   // Reuse existing vector loop preheader for TC checks.
2687   // Note that new preheader block is generated for vector loop.
2688   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2689   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2690 
2691   // Generate code to check if the loop's trip count is less than VF * UF, or
2692   // equal to it in case a scalar epilogue is required; this implies that the
2693   // vector trip count is zero. This check also covers the case where adding one
2694   // to the backedge-taken count overflowed leading to an incorrect trip count
2695   // of zero. In this case we will also jump to the scalar loop.
2696   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2697                                           : ICmpInst::ICMP_ULT;
2698 
2699   // If tail is to be folded, vector loop takes care of all iterations.
2700   Value *CheckMinIters = Builder.getFalse();
2701   if (!Cost->foldTailByMasking())
2702     CheckMinIters = Builder.CreateICmp(
2703         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2704         "min.iters.check");
2705 
2706   // Create new preheader for vector loop.
2707   LoopVectorPreHeader =
2708       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2709                  "vector.ph");
2710 
2711   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2712                                DT->getNode(Bypass)->getIDom()) &&
2713          "TC check is expected to dominate Bypass");
2714 
2715   // Update dominator for Bypass & LoopExit.
2716   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2717   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2718 
2719   ReplaceInstWithInst(
2720       TCCheckBlock->getTerminator(),
2721       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2722   LoopBypassBlocks.push_back(TCCheckBlock);
2723 }
2724 
2725 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2726   // Reuse existing vector loop preheader for SCEV checks.
2727   // Note that new preheader block is generated for vector loop.
2728   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2729 
  // Generate the code that checks the SCEV assumptions we made.
2731   // We want the new basic block to start at the first instruction in a
2732   // sequence of instructions that form a check.
2733   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2734                    "scev.check");
2735   Value *SCEVCheck = Exp.expandCodeForPredicate(
2736       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2737 
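  // If the check folds to a constant false, the SCEV assumptions always hold;
  // no runtime check or bypass block is needed.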
2738   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2739     if (C->isZero())
2740       return;
2741 
2742   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2743          "Cannot SCEV check stride or overflow when optimizing for size");
2744 
2745   SCEVCheckBlock->setName("vector.scevcheck");
2746   // Create new preheader for vector loop.
2747   LoopVectorPreHeader =
2748       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2749                  nullptr, "vector.ph");
2750 
  // Update dominator only if this is the first RT check.
2752   if (LoopBypassBlocks.empty()) {
2753     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2754     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2755   }
2756 
2757   ReplaceInstWithInst(
2758       SCEVCheckBlock->getTerminator(),
2759       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2760   LoopBypassBlocks.push_back(SCEVCheckBlock);
2761   AddedSafetyChecks = true;
2762 }
2763 
2764 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2765   // VPlan-native path does not do any analysis for runtime checks currently.
2766   if (EnableVPlanNativePath)
2767     return;
2768 
2769   // Reuse existing vector loop preheader for runtime memory checks.
2770   // Note that new preheader block is generated for vector loop.
2771   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2772 
2773   // Generate the code that checks in runtime if arrays overlap. We put the
2774   // checks into a separate block to make the more common case of few elements
2775   // faster.
2776   Instruction *FirstCheckInst;
2777   Instruction *MemRuntimeCheck;
2778   std::tie(FirstCheckInst, MemRuntimeCheck) =
2779       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
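  // If LAA determined that no runtime memory checks are required, there is
  // nothing to do.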
2780   if (!MemRuntimeCheck)
2781     return;
2782 
2783   if (MemCheckBlock->getParent()->hasOptSize()) {
2784     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2785            "Cannot emit memory checks when optimizing for size, unless forced "
2786            "to vectorize.");
2787     ORE->emit([&]() {
2788       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2789                                         L->getStartLoc(), L->getHeader())
2790              << "Code-size may be reduced by not forcing "
2791                 "vectorization, or by source-code modifications "
2792                 "eliminating the need for runtime checks "
2793                 "(e.g., adding 'restrict').";
2794     });
2795   }
2796 
2797   MemCheckBlock->setName("vector.memcheck");
2798   // Create new preheader for vector loop.
2799   LoopVectorPreHeader =
2800       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2801                  "vector.ph");
2802 
  // Update dominator only if this is the first RT check.
2804   if (LoopBypassBlocks.empty()) {
2805     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2806     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2807   }
2808 
2809   ReplaceInstWithInst(
2810       MemCheckBlock->getTerminator(),
2811       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2812   LoopBypassBlocks.push_back(MemCheckBlock);
2813   AddedSafetyChecks = true;
2814 
2815   // We currently don't use LoopVersioning for the actual loop cloning but we
2816   // still use it to add the noalias metadata.
2817   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2818                                           PSE.getSE());
2819   LVer->prepareNoAliasMetadata();
2820 }
2821 
2822 Value *InnerLoopVectorizer::emitTransformedIndex(
2823     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2824     const InductionDescriptor &ID) const {
2825 
2826   SCEVExpander Exp(*SE, DL, "induction");
2827   auto Step = ID.getStep();
2828   auto StartValue = ID.getStartValue();
2829   assert(Index->getType() == Step->getType() &&
2830          "Index type does not match StepValue type");
2831 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code, because attempting to do so on invalid IR may lead to
  // various SCEV crashes. So all we can do is use the builder and rely on
  // InstCombine for future simplifications. Here we handle only some trivial
  // cases.
2838   auto CreateAdd = [&B](Value *X, Value *Y) {
2839     assert(X->getType() == Y->getType() && "Types don't match!");
2840     if (auto *CX = dyn_cast<ConstantInt>(X))
2841       if (CX->isZero())
2842         return Y;
2843     if (auto *CY = dyn_cast<ConstantInt>(Y))
2844       if (CY->isZero())
2845         return X;
2846     return B.CreateAdd(X, Y);
2847   };
2848 
2849   auto CreateMul = [&B](Value *X, Value *Y) {
2850     assert(X->getType() == Y->getType() && "Types don't match!");
2851     if (auto *CX = dyn_cast<ConstantInt>(X))
2852       if (CX->isOne())
2853         return Y;
2854     if (auto *CY = dyn_cast<ConstantInt>(Y))
2855       if (CY->isOne())
2856         return X;
2857     return B.CreateMul(X, Y);
2858   };
2859 
2860   switch (ID.getKind()) {
2861   case InductionDescriptor::IK_IntInduction: {
2862     assert(Index->getType() == StartValue->getType() &&
2863            "Index type does not match StartValue type");
2864     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2865       return B.CreateSub(StartValue, Index);
2866     auto *Offset = CreateMul(
2867         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2868     return CreateAdd(StartValue, Offset);
2869   }
2870   case InductionDescriptor::IK_PtrInduction: {
2871     assert(isa<SCEVConstant>(Step) &&
2872            "Expected constant step for pointer induction");
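    // A pointer induction advances by Index * Step elements from StartValue;
    // express this as a GEP over the pointee type.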
2873     return B.CreateGEP(
2874         StartValue->getType()->getPointerElementType(), StartValue,
2875         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2876                                            &*B.GetInsertPoint())));
2877   }
2878   case InductionDescriptor::IK_FpInduction: {
2879     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2880     auto InductionBinOp = ID.getInductionBinOp();
2881     assert(InductionBinOp &&
2882            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2883             InductionBinOp->getOpcode() == Instruction::FSub) &&
2884            "Original bin op should be defined for FP induction");
2885 
2886     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2887 
2888     // Floating point operations had to be 'fast' to enable the induction.
2889     FastMathFlags Flags;
2890     Flags.setFast();
2891 
2892     Value *MulExp = B.CreateFMul(StepValue, Index);
2893     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may have been folded to a constant.
2895       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2896 
2897     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2898                                "induction");
2899     if (isa<Instruction>(BOp))
2900       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2901 
2902     return BOp;
2903   }
2904   case InductionDescriptor::IK_NoInduction:
2905     return nullptr;
2906   }
2907   llvm_unreachable("invalid enum");
2908 }
2909 
2910 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2911   /*
2912    In this function we generate a new loop. The new loop will contain
2913    the vectorized instructions while the old loop will continue to run the
2914    scalar remainder.
2915 
2916        [ ] <-- loop iteration number check.
2917     /   |
2918    /    v
2919   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2920   |  /  |
2921   | /   v
2922   ||   [ ]     <-- vector pre header.
2923   |/    |
2924   |     v
2925   |    [  ] \
2926   |    [  ]_|   <-- vector loop.
2927   |     |
2928   |     v
2929   |   -[ ]   <--- middle-block.
2930   |  /  |
2931   | /   v
2932   -|- >[ ]     <--- new preheader.
2933    |    |
2934    |    v
2935    |   [ ] \
2936    |   [ ]_|   <-- old scalar loop to handle remainder.
2937     \   |
2938      \  v
2939       >[ ]     <-- exit block.
2940    ...
2941    */
2942 
2943   MDNode *OrigLoopID = OrigLoop->getLoopID();
2944 
2945   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
2947   // induction variables. In the code below we also support a case where we
2948   // don't have a single induction variable.
2949   //
2950   // We try to obtain an induction variable from the original loop as hard
2951   // as possible. However if we don't find one that:
2952   //   - is an integer
2953   //   - counts from zero, stepping by one
2954   //   - is the size of the widest induction variable type
2955   // then we create a new one.
2956   OldInduction = Legal->getPrimaryInduction();
2957   Type *IdxTy = Legal->getWidestInductionType();
2958 
2959   // Split the single block loop into the two loop structure described above.
2960   LoopScalarBody = OrigLoop->getHeader();
2961   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2962   LoopExitBlock = OrigLoop->getExitBlock();
2963   assert(LoopExitBlock && "Must have an exit block");
2964   assert(LoopVectorPreHeader && "Invalid loop structure");
2965 
2966   LoopMiddleBlock =
2967       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2968                  LI, nullptr, "middle.block");
2969   LoopScalarPreHeader =
2970       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2971                  nullptr, "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
2975   LoopVectorBody =
2976       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2977                  nullptr, nullptr, "vector.body");
2978 
2979   // Update dominator for loop exit.
2980   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2981 
2982   // Create and register the new vector loop.
2983   Loop *Lp = LI->AllocateLoop();
2984   Loop *ParentLoop = OrigLoop->getParentLoop();
2985 
2986   // Insert the new loop into the loop nest and register the new basic blocks
2987   // before calling any utilities such as SCEV that require valid LoopInfo.
2988   if (ParentLoop) {
2989     ParentLoop->addChildLoop(Lp);
2990   } else {
2991     LI->addTopLevelLoop(Lp);
2992   }
2993   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
2994 
2995   // Find the loop boundaries.
2996   Value *Count = getOrCreateTripCount(Lp);
2997 
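  // The canonical induction variable of the vector loop starts at zero.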
2998   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2999 
3000   // Now, compare the new count to zero. If it is zero skip the vector loop and
3001   // jump to the scalar loop. This check also covers the case where the
3002   // backedge-taken count is uint##_max: adding one to it will overflow leading
3003   // to an incorrect trip count of zero. In this (rare) case we will also jump
3004   // to the scalar loop.
3005   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3006 
3007   // Generate the code to check any assumptions that we've made for SCEV
3008   // expressions.
3009   emitSCEVChecks(Lp, LoopScalarPreHeader);
3010 
3011   // Generate the code that checks in runtime if arrays overlap. We put the
3012   // checks into a separate block to make the more common case of few elements
3013   // faster.
3014   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3015 
3016   // Generate the induction variable.
3017   // The loop step is equal to the vectorization factor (num of SIMD elements)
3018   // times the unroll factor (num of SIMD instructions).
3019   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3020   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3021   Induction =
3022       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3023                               getDebugLocFromInstOrOperands(OldInduction));
3024 
3025   // We are going to resume the execution of the scalar loop.
3026   // Go over all of the induction variables that we found and fix the
3027   // PHIs that are left in the scalar version of the loop.
3028   // The starting values of PHI nodes depend on the counter of the last
3029   // iteration in the vectorized loop.
3030   // If we come from a bypass edge then we need to start from the original
3031   // start value.
3032 
  // The resume value (bc.resume.val) created below saves the new starting
  // index for the scalar loop, which handles any tail iterations left once
  // the vector loop has completed.
3036   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3037   for (auto &InductionEntry : *List) {
3038     PHINode *OrigPhi = InductionEntry.first;
3039     InductionDescriptor II = InductionEntry.second;
3040 
    // Create phi nodes to merge from the backedge-taken check block.
3042     PHINode *BCResumeVal =
3043         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3044                         LoopScalarPreHeader->getTerminator());
3045     // Copy original phi DL over to the new one.
3046     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3047     Value *&EndValue = IVEndValues[OrigPhi];
3048     if (OrigPhi == OldInduction) {
3049       // We know what the end value is.
3050       EndValue = CountRoundDown;
3051     } else {
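      // Compute the end value of this induction by applying its transform
      // (start + step * count, or the pointer/FP equivalent) to the vector
      // trip count.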
3052       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3053       Type *StepType = II.getStep()->getType();
3054       Instruction::CastOps CastOp =
3055           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3056       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3057       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3058       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3059       EndValue->setName("ind.end");
3060     }
3061 
3062     // The new PHI merges the original incoming value, in case of a bypass,
3063     // or the value at the end of the vectorized loop.
3064     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3065 
3066     // Fix the scalar body counter (PHI node).
3067     // The old induction's phi node in the scalar body needs the truncated
3068     // value.
3069     for (BasicBlock *BB : LoopBypassBlocks)
3070       BCResumeVal->addIncoming(II.getStartValue(), BB);
3071     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3072   }
3073 
3074   // We need the OrigLoop (scalar loop part) latch terminator to help
3075   // produce correct debug info for the middle block BB instructions.
3076   // The legality check stage guarantees that the loop will have a single
3077   // latch.
3078   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3079          "Scalar loop latch terminator isn't a branch");
3080   BranchInst *ScalarLatchBr =
3081       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3082 
3083   // Add a check in the middle block to see if we have completed
3084   // all of the iterations in the first vector loop.
3085   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3086   // If tail is to be folded, we know we don't need to run the remainder.
3087   Value *CmpN = Builder.getTrue();
3088   if (!Cost->foldTailByMasking()) {
3089     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3090                            CountRoundDown, "cmp.n",
3091                            LoopMiddleBlock->getTerminator());
3092 
3093     // Here we use the same DebugLoc as the scalar loop latch branch instead
3094     // of the corresponding compare because they may have ended up with
3095     // different line numbers and we want to avoid awkward line stepping while
3096     // debugging. Eg. if the compare has got a line number inside the loop.
3097     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3098   }
3099 
3100   BranchInst *BrInst =
3101       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3102   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3103   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3104 
3105   // Get ready to start creating new instructions into the vectorized body.
3106   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3107          "Inconsistent vector loop preheader");
3108   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3109 
3110   Optional<MDNode *> VectorizedLoopID =
3111       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3112                                       LLVMLoopVectorizeFollowupVectorized});
3113   if (VectorizedLoopID.hasValue()) {
3114     Lp->setLoopID(VectorizedLoopID.getValue());
3115 
3116     // Do not setAlreadyVectorized if loop attributes have been defined
3117     // explicitly.
3118     return LoopVectorPreHeader;
3119   }
3120 
3121   // Keep all loop hints from the original loop on the vector loop (we'll
3122   // replace the vectorizer-specific hints below).
3123   if (MDNode *LID = OrigLoop->getLoopID())
3124     Lp->setLoopID(LID);
3125 
3126   LoopVectorizeHints Hints(Lp, true, *ORE);
3127   Hints.setAlreadyVectorized();
3128 
3129 #ifdef EXPENSIVE_CHECKS
3130   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3131   LI->verify(*DT);
3132 #endif
3133 
3134   return LoopVectorPreHeader;
3135 }
3136 
3137 // Fix up external users of the induction variable. At this point, we are
3138 // in LCSSA form, with all external PHIs that use the IV having one input value,
3139 // coming from the remainder loop. We need those PHIs to also have a correct
3140 // value for the IV when arriving directly from the middle block.
3141 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3142                                        const InductionDescriptor &II,
3143                                        Value *CountRoundDown, Value *EndValue,
3144                                        BasicBlock *MiddleBlock) {
3145   // There are two kinds of external IV usages - those that use the value
3146   // computed in the last iteration (the PHI) and those that use the penultimate
3147   // value (the value that feeds into the phi from the loop latch).
3148   // We allow both, but they, obviously, have different values.
3149 
3150   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3151 
3152   DenseMap<Value *, Value *> MissingVals;
3153 
3154   // An external user of the last iteration's value should see the value that
3155   // the remainder loop uses to initialize its own IV.
3156   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3157   for (User *U : PostInc->users()) {
3158     Instruction *UI = cast<Instruction>(U);
3159     if (!OrigLoop->contains(UI)) {
3160       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3161       MissingVals[UI] = EndValue;
3162     }
3163   }
3164 
  // An external user of the penultimate value needs to see EndValue - Step.
3166   // The simplest way to get this is to recompute it from the constituent SCEVs,
3167   // that is Start + (Step * (CRD - 1)).
3168   for (User *U : OrigPhi->users()) {
3169     auto *UI = cast<Instruction>(U);
3170     if (!OrigLoop->contains(UI)) {
3171       const DataLayout &DL =
3172           OrigLoop->getHeader()->getModule()->getDataLayout();
3173       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3174 
3175       IRBuilder<> B(MiddleBlock->getTerminator());
3176       Value *CountMinusOne = B.CreateSub(
3177           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3178       Value *CMO =
3179           !II.getStep()->getType()->isIntegerTy()
3180               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3181                              II.getStep()->getType())
3182               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3183       CMO->setName("cast.cmo");
3184       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3185       Escape->setName("ind.escape");
3186       MissingVals[UI] = Escape;
3187     }
3188   }
3189 
3190   for (auto &I : MissingVals) {
3191     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3193     // that is %IV2 = phi [...], [ %IV1, %latch ]
3194     // In this case, if IV1 has an external use, we need to avoid adding both
3195     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3196     // don't already have an incoming value for the middle block.
3197     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3198       PHI->addIncoming(I.second, MiddleBlock);
3199   }
3200 }
3201 
3202 namespace {
3203 
3204 struct CSEDenseMapInfo {
3205   static bool canHandle(const Instruction *I) {
3206     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3207            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3208   }
3209 
3210   static inline Instruction *getEmptyKey() {
3211     return DenseMapInfo<Instruction *>::getEmptyKey();
3212   }
3213 
3214   static inline Instruction *getTombstoneKey() {
3215     return DenseMapInfo<Instruction *>::getTombstoneKey();
3216   }
3217 
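  // Hash on the opcode and the operand values so that structurally identical
  // instructions map to the same bucket.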
3218   static unsigned getHashValue(const Instruction *I) {
3219     assert(canHandle(I) && "Unknown instruction!");
3220     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3221                                                            I->value_op_end()));
3222   }
3223 
3224   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3225     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3226         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3227       return LHS == RHS;
3228     return LHS->isIdenticalTo(RHS);
3229   }
3230 };
3231 
3232 } // end anonymous namespace
3233 
/// Perform CSE of induction variable instructions.
3235 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3237   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3238   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3239     Instruction *In = &*I++;
3240 
3241     if (!CSEDenseMapInfo::canHandle(In))
3242       continue;
3243 
3244     // Check if we can replace this instruction with any of the
3245     // visited instructions.
3246     if (Instruction *V = CSEMap.lookup(In)) {
3247       In->replaceAllUsesWith(V);
3248       In->eraseFromParent();
3249       continue;
3250     }
3251 
3252     CSEMap[In] = In;
3253   }
3254 }
3255 
3256 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3257                                                        unsigned VF,
3258                                                        bool &NeedToScalarize) {
3259   Function *F = CI->getCalledFunction();
3260   StringRef FnName = CI->getCalledFunction()->getName();
3261   Type *ScalarRetTy = CI->getType();
3262   SmallVector<Type *, 4> Tys, ScalarTys;
3263   for (auto &ArgOp : CI->arg_operands())
3264     ScalarTys.push_back(ArgOp->getType());
3265 
3266   // Estimate cost of scalarized vector call. The source operands are assumed
3267   // to be vectors, so we need to extract individual elements from there,
3268   // execute VF scalar calls, and then gather the result into the vector return
3269   // value.
3270   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3271   if (VF == 1)
3272     return ScalarCallCost;
3273 
3274   // Compute corresponding vector type for return value and arguments.
3275   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3276   for (Type *ScalarTy : ScalarTys)
3277     Tys.push_back(ToVectorTy(ScalarTy, VF));
3278 
3279   // Compute costs of unpacking argument values for the scalar calls and
3280   // packing the return values to a vector.
3281   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3282 
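  // Total cost of scalarizing the call: VF scalar calls plus the
  // packing/unpacking overhead.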
3283   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3284 
3285   // If we can't emit a vector call for this function, then the currently found
3286   // cost is the cost we need to return.
3287   NeedToScalarize = true;
3288   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3289     return Cost;
3290 
3291   // If the corresponding vector cost is cheaper, return its cost.
3292   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3293   if (VectorCallCost < Cost) {
3294     NeedToScalarize = false;
3295     return VectorCallCost;
3296   }
3297   return Cost;
3298 }
3299 
3300 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3301                                                             unsigned VF) {
3302   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3303   assert(ID && "Expected intrinsic call!");
3304 
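  // Propagate the call's fast-math flags (if any) to the intrinsic cost query.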
3305   FastMathFlags FMF;
3306   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3307     FMF = FPMO->getFastMathFlags();
3308 
3309   SmallVector<Value *, 4> Operands(CI->arg_operands());
3310   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3311 }
3312 
3313 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3314   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3315   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3316   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3317 }
3318 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3319   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3320   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3321   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3322 }
3323 
3324 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3325   // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I`, and re-extend its result. InstCombine runs
3327   // later and will remove any ext/trunc pairs.
3328   SmallPtrSet<Value *, 4> Erased;
3329   for (const auto &KV : Cost->getMinimalBitwidths()) {
3330     // If the value wasn't vectorized, we must maintain the original scalar
3331     // type. The absence of the value from VectorLoopValueMap indicates that it
3332     // wasn't vectorized.
3333     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3334       continue;
3335     for (unsigned Part = 0; Part < UF; ++Part) {
3336       Value *I = getOrCreateVectorValue(KV.first, Part);
3337       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3338           !isa<Instruction>(I))
3339         continue;
3340       Type *OriginalTy = I->getType();
3341       Type *ScalarTruncatedTy =
3342           IntegerType::get(OriginalTy->getContext(), KV.second);
3343       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3344                                           OriginalTy->getVectorNumElements());
3345       if (TruncatedTy == OriginalTy)
3346         continue;
3347 
3348       IRBuilder<> B(cast<Instruction>(I));
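      // Shrink an operand to the truncated type, looking through a zext from
      // that type to avoid creating a redundant trunc(zext(x)) pair.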
3349       auto ShrinkOperand = [&](Value *V) -> Value * {
3350         if (auto *ZI = dyn_cast<ZExtInst>(V))
3351           if (ZI->getSrcTy() == TruncatedTy)
3352             return ZI->getOperand(0);
3353         return B.CreateZExtOrTrunc(V, TruncatedTy);
3354       };
3355 
3356       // The actual instruction modification depends on the instruction type,
3357       // unfortunately.
3358       Value *NewI = nullptr;
3359       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3360         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3361                              ShrinkOperand(BO->getOperand(1)));
3362 
3363         // Any wrapping introduced by shrinking this operation shouldn't be
3364         // considered undefined behavior. So, we can't unconditionally copy
3365         // arithmetic wrapping flags to NewI.
3366         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3367       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3368         NewI =
3369             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3370                          ShrinkOperand(CI->getOperand(1)));
3371       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3372         NewI = B.CreateSelect(SI->getCondition(),
3373                               ShrinkOperand(SI->getTrueValue()),
3374                               ShrinkOperand(SI->getFalseValue()));
3375       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3376         switch (CI->getOpcode()) {
3377         default:
3378           llvm_unreachable("Unhandled cast!");
3379         case Instruction::Trunc:
3380           NewI = ShrinkOperand(CI->getOperand(0));
3381           break;
3382         case Instruction::SExt:
3383           NewI = B.CreateSExtOrTrunc(
3384               CI->getOperand(0),
3385               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3386           break;
3387         case Instruction::ZExt:
3388           NewI = B.CreateZExtOrTrunc(
3389               CI->getOperand(0),
3390               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3391           break;
3392         }
3393       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3394         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3395         auto *O0 = B.CreateZExtOrTrunc(
3396             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3397         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3398         auto *O1 = B.CreateZExtOrTrunc(
3399             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3400 
3401         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3402       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3403         // Don't do anything with the operands, just extend the result.
3404         continue;
3405       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3406         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3407         auto *O0 = B.CreateZExtOrTrunc(
3408             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3409         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3410         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3411       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3412         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3413         auto *O0 = B.CreateZExtOrTrunc(
3414             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3415         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3416       } else {
3417         // If we don't know what to do, be conservative and don't do anything.
3418         continue;
3419       }
3420 
3421       // Lastly, extend the result.
3422       NewI->takeName(cast<Instruction>(I));
3423       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3424       I->replaceAllUsesWith(Res);
3425       cast<Instruction>(I)->eraseFromParent();
3426       Erased.insert(I);
3427       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3428     }
3429   }
3430 
  // We'll have created a bunch of ZExts that are now unused. Clean them up.
3432   for (const auto &KV : Cost->getMinimalBitwidths()) {
3433     // If the value wasn't vectorized, we must maintain the original scalar
3434     // type. The absence of the value from VectorLoopValueMap indicates that it
3435     // wasn't vectorized.
3436     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3437       continue;
3438     for (unsigned Part = 0; Part < UF; ++Part) {
3439       Value *I = getOrCreateVectorValue(KV.first, Part);
3440       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3441       if (Inst && Inst->use_empty()) {
3442         Value *NewI = Inst->getOperand(0);
3443         Inst->eraseFromParent();
3444         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3445       }
3446     }
3447   }
3448 }
3449 
3450 void InnerLoopVectorizer::fixVectorizedLoop() {
3451   // Insert truncates and extends for any truncated instructions as hints to
3452   // InstCombine.
3453   if (VF > 1)
3454     truncateToMinimalBitwidths();
3455 
3456   // Fix widened non-induction PHIs by setting up the PHI operands.
3457   if (OrigPHIsToFix.size()) {
3458     assert(EnableVPlanNativePath &&
3459            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3460     fixNonInductionPHIs();
3461   }
3462 
3463   // At this point every instruction in the original loop is widened to a
3464   // vector form. Now we need to fix the recurrences in the loop. These PHI
3465   // nodes are currently empty because we did not want to introduce cycles.
3466   // This is the second stage of vectorizing recurrences.
3467   fixCrossIterationPHIs();
3468 
3469   // Forget the original basic block.
3470   PSE.getSE()->forgetLoop(OrigLoop);
3471 
3472   // Fix-up external users of the induction variables.
3473   for (auto &Entry : *Legal->getInductionVars())
3474     fixupIVUsers(Entry.first, Entry.second,
3475                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3476                  IVEndValues[Entry.first], LoopMiddleBlock);
3477 
3478   fixLCSSAPHIs();
3479   for (Instruction *PI : PredicatedInstructions)
3480     sinkScalarOperands(&*PI);
3481 
3482   // Remove redundant induction instructions.
3483   cse(LoopVectorBody);
3484 }
3485 
3486 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3487   // In order to support recurrences we need to be able to vectorize Phi nodes.
3488   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3489   // stage #2: We now need to fix the recurrences by adding incoming edges to
3490   // the currently empty PHI nodes. At this point every instruction in the
3491   // original loop is widened to a vector form so we can use them to construct
3492   // the incoming edges.
3493   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3494     // Handle first-order recurrences and reductions that need to be fixed.
3495     if (Legal->isFirstOrderRecurrence(&Phi))
3496       fixFirstOrderRecurrence(&Phi);
3497     else if (Legal->isReductionVariable(&Phi))
3498       fixReduction(&Phi);
3499   }
3500 }
3501 
3502 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3503   // This is the second phase of vectorizing first-order recurrences. An
3504   // overview of the transformation is described below. Suppose we have the
3505   // following loop.
3506   //
3507   //   for (int i = 0; i < n; ++i)
3508   //     b[i] = a[i] - a[i - 1];
3509   //
3510   // There is a first-order recurrence on "a". For this loop, the shorthand
3511   // scalar IR looks like:
3512   //
3513   //   scalar.ph:
3514   //     s_init = a[-1]
3515   //     br scalar.body
3516   //
3517   //   scalar.body:
3518   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3519   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3520   //     s2 = a[i]
3521   //     b[i] = s2 - s1
3522   //     br cond, scalar.body, ...
3523   //
  // In this example, s1 is a recurrence because its value depends on the
3525   // previous iteration. In the first phase of vectorization, we created a
3526   // temporary value for s1. We now complete the vectorization and produce the
3527   // shorthand vector IR shown below (for VF = 4, UF = 1).
3528   //
3529   //   vector.ph:
3530   //     v_init = vector(..., ..., ..., a[-1])
3531   //     br vector.body
3532   //
3533   //   vector.body
3534   //     i = phi [0, vector.ph], [i+4, vector.body]
3535   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3536   //     v2 = a[i, i+1, i+2, i+3];
3537   //     v3 = vector(v1(3), v2(0, 1, 2))
3538   //     b[i, i+1, i+2, i+3] = v2 - v3
3539   //     br cond, vector.body, middle.block
3540   //
3541   //   middle.block:
3542   //     x = v2(3)
3543   //     br scalar.ph
3544   //
3545   //   scalar.ph:
3546   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3547   //     br scalar.body
3548   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3551 
3552   // Get the original loop preheader and single loop latch.
3553   auto *Preheader = OrigLoop->getLoopPreheader();
3554   auto *Latch = OrigLoop->getLoopLatch();
3555 
3556   // Get the initial and previous values of the scalar recurrence.
3557   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3558   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3559 
3560   // Create a vector from the initial value.
3561   auto *VectorInit = ScalarInit;
3562   if (VF > 1) {
3563     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3564     VectorInit = Builder.CreateInsertElement(
3565         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3566         Builder.getInt32(VF - 1), "vector.recur.init");
3567   }
3568 
3569   // We constructed a temporary phi node in the first phase of vectorization.
3570   // This phi node will eventually be deleted.
3571   Builder.SetInsertPoint(
3572       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3573 
3574   // Create a phi node for the new recurrence. The current value will either be
3575   // the initial value inserted into a vector or loop-varying vector value.
3576   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3577   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3578 
3579   // Get the vectorized previous value of the last part UF - 1. It appears last
3580   // among all unrolled iterations, due to the order of their construction.
3581   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3582 
3583   // Find and set the insertion point after the previous value if it is an
3584   // instruction.
3585   BasicBlock::iterator InsertPt;
3586   // Note that the previous value may have been constant-folded so it is not
3587   // guaranteed to be an instruction in the vector loop.
3588   // FIXME: Loop invariant values do not form recurrences. We should deal with
3589   //        them earlier.
3590   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3591     InsertPt = LoopVectorBody->getFirstInsertionPt();
3592   else {
3593     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3594     if (isa<PHINode>(PreviousLastPart))
3595       // If the previous value is a phi node, we should insert after all the phi
3596       // nodes in the block containing the PHI to avoid breaking basic block
3597       // verification. Note that the basic block may be different to
3598       // LoopVectorBody, in case we predicate the loop.
3599       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3600     else
3601       InsertPt = ++PreviousInst->getIterator();
3602   }
3603   Builder.SetInsertPoint(&*InsertPt);
3604 
3605   // We will construct a vector for the recurrence by combining the values for
3606   // the current and previous iterations. This is the required shuffle mask.
3607   SmallVector<Constant *, 8> ShuffleMask(VF);
3608   ShuffleMask[0] = Builder.getInt32(VF - 1);
3609   for (unsigned I = 1; I < VF; ++I)
3610     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
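  // For example, with VF = 4 the mask is <3, 4, 5, 6>: the last lane of the
  // first shuffle operand followed by the first three lanes of the second.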
3611 
3612   // The vector from which to take the initial value for the current iteration
3613   // (actual or unrolled). Initially, this is the vector phi node.
3614   Value *Incoming = VecPhi;
3615 
3616   // Shuffle the current and previous vector and update the vector parts.
3617   for (unsigned Part = 0; Part < UF; ++Part) {
3618     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3619     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3620     auto *Shuffle =
3621         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3622                                              ConstantVector::get(ShuffleMask))
3623                : Incoming;
3624     PhiPart->replaceAllUsesWith(Shuffle);
3625     cast<Instruction>(PhiPart)->eraseFromParent();
3626     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3627     Incoming = PreviousPart;
3628   }
3629 
3630   // Fix the latch value of the new recurrence in the vector loop.
3631   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3632 
3633   // Extract the last vector element in the middle block. This will be the
3634   // initial value for the recurrence when jumping to the scalar loop.
3635   auto *ExtractForScalar = Incoming;
3636   if (VF > 1) {
3637     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3638     ExtractForScalar = Builder.CreateExtractElement(
3639         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3640   }
3641   // Extract the second last element in the middle block if the
3642   // Phi is used outside the loop. We need to extract the phi itself
3643   // and not the last element (the phi update in the current iteration). This
3644   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3645   // when the scalar loop is not run at all.
3646   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3647   if (VF > 1)
3648     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3649         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second last element when VF > 1.
3654   else if (UF > 1)
3655     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3656 
3657   // Fix the initial value of the original recurrence in the scalar loop.
3658   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3659   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3660   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3661     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3662     Start->addIncoming(Incoming, BB);
3663   }
3664 
3665   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3666   Phi->setName("scalar.recur");
3667 
3668   // Finally, fix users of the recurrence outside the loop. The users will need
3669   // either the last value of the scalar recurrence or the last value of the
3670   // vector recurrence we extracted in the middle block. Since the loop is in
3671   // LCSSA form, we just need to find all the phi nodes for the original scalar
3672   // recurrence in the exit block, and then add an edge for the middle block.
3673   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3674     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3675       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3676     }
3677   }
3678 }
3679 
3680 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3681   Constant *Zero = Builder.getInt32(0);
3682 
  // Get its reduction variable descriptor.
3684   assert(Legal->isReductionVariable(Phi) &&
3685          "Unable to find the reduction variable");
3686   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3687 
3688   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3689   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3690   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3691   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3692     RdxDesc.getMinMaxRecurrenceKind();
3693   setDebugLocFromInst(Builder, ReductionStartValue);
3694 
3695   // We need to generate a reduction vector from the incoming scalar.
3696   // To do so, we need to generate the 'identity' vector and override
3697   // one of the elements with the incoming scalar reduction. We need
3698   // to do it in the vector-loop preheader.
3699   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3700 
3701   // This is the vector-clone of the value that leaves the loop.
3702   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3703 
  // Find the reduction identity value: zero for addition, or and xor; one
  // for multiplication; -1 (all ones) for and.
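  // For example, for an integer add reduction with a start value %s and
  // VF = 4, the identity vector is <0, 0, 0, 0> and the vector start value
  // is <%s, 0, 0, 0>.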
3706   Value *Identity;
3707   Value *VectorStart;
3708   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3709       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3711     if (VF == 1) {
3712       VectorStart = Identity = ReductionStartValue;
3713     } else {
3714       VectorStart = Identity =
3715         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3716     }
3717   } else {
3718     // Handle other reduction kinds:
3719     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3720         RK, VecTy->getScalarType());
3721     if (VF == 1) {
3722       Identity = Iden;
3723       // This vector is the Identity vector where the first element is the
3724       // incoming scalar reduction.
3725       VectorStart = ReductionStartValue;
3726     } else {
3727       Identity = ConstantVector::getSplat(VF, Iden);
3728 
3729       // This vector is the Identity vector where the first element is the
3730       // incoming scalar reduction.
3731       VectorStart =
3732         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3733     }
3734   }
3735 
3736   // Wrap flags are in general invalid after vectorization, clear them.
3737   clearReductionWrapFlags(RdxDesc);
3738 
3739   // Fix the vector-loop phi.
3740 
3741   // Reductions do not have to start at zero. They can start with
3742   // any loop invariant values.
3743   BasicBlock *Latch = OrigLoop->getLoopLatch();
3744   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3745 
3746   for (unsigned Part = 0; Part < UF; ++Part) {
3747     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3748     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3749     // Make sure to add the reduction start value only to the
3750     // first unroll part.
3751     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3752     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3753     cast<PHINode>(VecRdxPhi)
3754       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3755   }
3756 
3757   // Before each round, move the insertion point right between
3758   // the PHIs and the values we are going to write.
3759   // This allows us to write both PHINodes and the extractelement
3760   // instructions.
3761   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3762 
3763   setDebugLocFromInst(Builder, LoopExitInst);
3764 
3765   // If tail is folded by masking, the vector value to leave the loop should be
3766   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3767   // instead of the former.
3768   if (Cost->foldTailByMasking()) {
3769     for (unsigned Part = 0; Part < UF; ++Part) {
3770       Value *VecLoopExitInst =
3771           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3772       Value *Sel = nullptr;
3773       for (User *U : VecLoopExitInst->users()) {
3774         if (isa<SelectInst>(U)) {
3775           assert(!Sel && "Reduction exit feeding two selects");
3776           Sel = U;
3777         } else
3778           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3779       }
3780       assert(Sel && "Reduction exit feeds no select");
3781       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3782     }
3783   }
3784 
3785   // If the vector reduction can be performed in a smaller type, we truncate
3786   // then extend the loop exit value to enable InstCombine to evaluate the
3787   // entire expression in the smaller type.
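  // For example, if the reduction is computed in i32 but its recurrence type
  // is i8, each part is truncated to <VF x i8> and then sign- or
  // zero-extended back to <VF x i32>, depending on whether the reduction is
  // signed.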
3788   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3789     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3790     Builder.SetInsertPoint(
3791         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3792     VectorParts RdxParts(UF);
3793     for (unsigned Part = 0; Part < UF; ++Part) {
3794       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3795       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3796       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3797                                         : Builder.CreateZExt(Trunc, VecTy);
3798       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3799            UI != RdxParts[Part]->user_end();)
3800         if (*UI != Trunc) {
3801           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3802           RdxParts[Part] = Extnd;
3803         } else {
3804           ++UI;
3805         }
3806     }
3807     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3808     for (unsigned Part = 0; Part < UF; ++Part) {
3809       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3810       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3811     }
3812   }
3813 
3814   // Reduce all of the unrolled parts into a single vector.
3815   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3816   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
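  // For example, with UF = 2 the two partial vectors of an add reduction are
  // combined below with a single 'bin.rdx' add before the final horizontal
  // reduction.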
3817 
3818   // The middle block terminator has already been assigned a DebugLoc here (the
3819   // OrigLoop's single latch terminator). We want the whole middle block to
3820   // appear to execute on this line because: (a) it is all compiler generated,
3821   // (b) these instructions are always executed after evaluating the latch
3822   // conditional branch, and (c) other passes may add new predecessors which
3823   // terminate on this line. This is the easiest way to ensure we don't
3824   // accidentally cause an extra step back into the loop while debugging.
3825   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3826   for (unsigned Part = 1; Part < UF; ++Part) {
3827     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3828     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3829       // Floating point operations had to be 'fast' to enable the reduction.
3830       ReducedPartRdx = addFastMathFlag(
3831           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3832                               ReducedPartRdx, "bin.rdx"),
3833           RdxDesc.getFastMathFlags());
3834     else
3835       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3836                                       RdxPart);
3837   }
3838 
3839   if (VF > 1) {
3840     bool NoNaN = Legal->hasFunNoNaNAttr();
3841     ReducedPartRdx =
3842         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3843     // If the reduction can be performed in a smaller type, we need to extend
3844     // the reduction to the wider type before we branch to the original loop.
3845     if (Phi->getType() != RdxDesc.getRecurrenceType())
3846       ReducedPartRdx =
3847         RdxDesc.isSigned()
3848         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3849         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3850   }
3851 
3852   // Create a phi node that merges control-flow from the backedge-taken check
3853   // block and the middle block.
3854   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3855                                         LoopScalarPreHeader->getTerminator());
3856   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3857     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3858   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3859 
3860   // Now, we need to fix the users of the reduction variable
3861   // inside and outside of the scalar remainder loop.
3862   // We know that the loop is in LCSSA form. We need to update the
3863   // PHI nodes in the exit blocks.
3864   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3865     // All PHINodes need to have a single entry edge, or two if
3866     // we already fixed them.
3867     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3868 
3869     // We found a reduction value exit-PHI. Update it with the
3870     // incoming bypass edge.
3871     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3872       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3873   } // end of the LCSSA phi scan.
3874 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3877   int IncomingEdgeBlockIdx =
3878     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3879   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3880   // Pick the other block.
3881   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3882   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3883   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3884 }
3885 
3886 void InnerLoopVectorizer::clearReductionWrapFlags(
3887     RecurrenceDescriptor &RdxDesc) {
3888   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3889   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3890       RK != RecurrenceDescriptor::RK_IntegerMult)
3891     return;
3892 
3893   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3894   assert(LoopExitInstr && "null loop exit instruction");
3895   SmallVector<Instruction *, 8> Worklist;
3896   SmallPtrSet<Instruction *, 8> Visited;
3897   Worklist.push_back(LoopExitInstr);
3898   Visited.insert(LoopExitInstr);
3899 
3900   while (!Worklist.empty()) {
3901     Instruction *Cur = Worklist.pop_back_val();
3902     if (isa<OverflowingBinaryOperator>(Cur))
3903       for (unsigned Part = 0; Part < UF; ++Part) {
3904         Value *V = getOrCreateVectorValue(Cur, Part);
3905         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3906       }
3907 
3908     for (User *U : Cur->users()) {
3909       Instruction *UI = cast<Instruction>(U);
3910       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3911           Visited.insert(UI).second)
3912         Worklist.push_back(UI);
3913     }
3914   }
3915 }
3916 
3917 void InnerLoopVectorizer::fixLCSSAPHIs() {
3918   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3919     if (LCSSAPhi.getNumIncomingValues() == 1) {
3920       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3921       // Non-instruction incoming values will have only one value.
3922       unsigned LastLane = 0;
3923       if (isa<Instruction>(IncomingValue))
3924           LastLane = Cost->isUniformAfterVectorization(
3925                          cast<Instruction>(IncomingValue), VF)
3926                          ? 0
3927                          : VF - 1;
3928       // Can be a loop invariant incoming value or the last scalar value to be
3929       // extracted from the vectorized loop.
3930       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3931       Value *lastIncomingValue =
3932           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3933       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3934     }
3935   }
3936 }
3937 
3938 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3939   // The basic block and loop containing the predicated instruction.
3940   auto *PredBB = PredInst->getParent();
3941   auto *VectorLoop = LI->getLoopFor(PredBB);
3942 
3943   // Initialize a worklist with the operands of the predicated instruction.
3944   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3945 
3946   // Holds instructions that we need to analyze again. An instruction may be
3947   // reanalyzed if we don't yet know if we can sink it or not.
3948   SmallVector<Instruction *, 8> InstsToReanalyze;
3949 
3950   // Returns true if a given use occurs in the predicated block. Phi nodes use
3951   // their operands in their corresponding predecessor blocks.
3952   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3953     auto *I = cast<Instruction>(U.getUser());
3954     BasicBlock *BB = I->getParent();
3955     if (auto *Phi = dyn_cast<PHINode>(I))
3956       BB = Phi->getIncomingBlock(
3957           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3958     return BB == PredBB;
3959   };
3960 
3961   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
3963   // operands are then added to the worklist. The algorithm ends after one pass
3964   // through the worklist doesn't sink a single instruction.
3965   bool Changed;
3966   do {
3967     // Add the instructions that need to be reanalyzed to the worklist, and
3968     // reset the changed indicator.
3969     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3970     InstsToReanalyze.clear();
3971     Changed = false;
3972 
3973     while (!Worklist.empty()) {
3974       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3975 
3976       // We can't sink an instruction if it is a phi node, is already in the
3977       // predicated block, is not in the loop, or may have side effects.
3978       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3979           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3980         continue;
3981 
3982       // It's legal to sink the instruction if all its uses occur in the
3983       // predicated block. Otherwise, there's nothing to do yet, and we may
3984       // need to reanalyze the instruction.
3985       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3986         InstsToReanalyze.push_back(I);
3987         continue;
3988       }
3989 
3990       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
3992       I->moveBefore(&*PredBB->getFirstInsertionPt());
3993       Worklist.insert(I->op_begin(), I->op_end());
3994 
3995       // The sinking may have enabled other instructions to be sunk, so we will
3996       // need to iterate.
3997       Changed = true;
3998     }
3999   } while (Changed);
4000 }
4001 
4002 void InnerLoopVectorizer::fixNonInductionPHIs() {
4003   for (PHINode *OrigPhi : OrigPHIsToFix) {
4004     PHINode *NewPhi =
4005         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4006     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4007 
4008     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4009         predecessors(OrigPhi->getParent()));
4010     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4011         predecessors(NewPhi->getParent()));
4012     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4013            "Scalar and Vector BB should have the same number of predecessors");
4014 
4015     // The insertion point in Builder may be invalidated by the time we get
4016     // here. Force the Builder insertion point to something valid so that we do
4017     // not run into issues during insertion point restore in
4018     // getOrCreateVectorValue calls below.
4019     Builder.SetInsertPoint(NewPhi);
4020 
4021     // The predecessor order is preserved and we can rely on mapping between
4022     // scalar and vector block predecessors.
4023     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4024       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4025 
4026       // When looking up the new scalar/vector values to fix up, use incoming
4027       // values from original phi.
4028       Value *ScIncV =
4029           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4030 
4031       // Scalar incoming value may need a broadcast
4032       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4033       NewPhi->addIncoming(NewIncV, NewPredBB);
4034     }
4035   }
4036 }
4037 
4038 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4039                                    unsigned VF, bool IsPtrLoopInvariant,
4040                                    SmallBitVector &IsIndexLoopInvariant) {
4041   // Construct a vector GEP by widening the operands of the scalar GEP as
4042   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4043   // results in a vector of pointers when at least one operand of the GEP
4044   // is vector-typed. Thus, to keep the representation compact, we only use
4045   // vector-typed operands for loop-varying values.
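  // For example, with VF = 4 a GEP whose only loop-varying operand is an
  // index keeps its loop-invariant pointer operand scalar, widens just that
  // index, and therefore produces a vector of 4 pointers for each unroll
  // part.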
4046 
4047   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4048     // If we are vectorizing, but the GEP has only loop-invariant operands,
4049     // the GEP we build (by only using vector-typed operands for
4050     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4051     // produce a vector of pointers, we need to either arbitrarily pick an
4052     // operand to broadcast, or broadcast a clone of the original GEP.
4053     // Here, we broadcast a clone of the original.
4054     //
4055     // TODO: If at some point we decide to scalarize instructions having
4056     //       loop-invariant operands, this special case will no longer be
4057     //       required. We would add the scalarization decision to
4058     //       collectLoopScalars() and teach getVectorValue() to broadcast
4059     //       the lane-zero scalar value.
4060     auto *Clone = Builder.Insert(GEP->clone());
4061     for (unsigned Part = 0; Part < UF; ++Part) {
4062       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4063       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4064       addMetadata(EntryPart, GEP);
4065     }
4066   } else {
4067     // If the GEP has at least one loop-varying operand, we are sure to
4068     // produce a vector of pointers. But if we are only unrolling, we want
4069     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4070     // produce with the code below will be scalar (if VF == 1) or vector
4071     // (otherwise). Note that for the unroll-only case, we still maintain
4072     // values in the vector mapping with initVector, as we do for other
4073     // instructions.
4074     for (unsigned Part = 0; Part < UF; ++Part) {
4075       // The pointer operand of the new GEP. If it's loop-invariant, we
4076       // won't broadcast it.
4077       auto *Ptr = IsPtrLoopInvariant
4078                       ? GEP->getPointerOperand()
4079                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4080 
4081       // Collect all the indices for the new GEP. If any index is
4082       // loop-invariant, we won't broadcast it.
4083       SmallVector<Value *, 4> Indices;
4084       for (auto Index : enumerate(GEP->indices())) {
4085         Value *User = Index.value().get();
4086         if (IsIndexLoopInvariant[Index.index()])
4087           Indices.push_back(User);
4088         else
4089           Indices.push_back(getOrCreateVectorValue(User, Part));
4090       }
4091 
4092       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4093       // but it should be a vector, otherwise.
4094       auto *NewGEP =
4095           GEP->isInBounds()
4096               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4097                                           Indices)
4098               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4099       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4100              "NewGEP is not a pointer vector");
4101       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4102       addMetadata(NewGEP, GEP);
4103     }
4104   }
4105 }
4106 
4107 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4108                                               unsigned VF) {
4109   PHINode *P = cast<PHINode>(PN);
4110   if (EnableVPlanNativePath) {
4111     // Currently we enter here in the VPlan-native path for non-induction
4112     // PHIs where all control flow is uniform. We simply widen these PHIs.
4113     // Create a vector phi with no operands - the vector phi operands will be
4114     // set at the end of vector code generation.
4115     Type *VecTy =
4116         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4117     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4118     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4119     OrigPHIsToFix.push_back(P);
4120 
4121     return;
4122   }
4123 
4124   assert(PN->getParent() == OrigLoop->getHeader() &&
4125          "Non-header phis should have been handled elsewhere");
4126 
4127   // In order to support recurrences we need to be able to vectorize Phi nodes.
4128   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4129   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4130   // this value when we vectorize all of the instructions that use the PHI.
4131   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4132     for (unsigned Part = 0; Part < UF; ++Part) {
4133       // This is phase one of vectorizing PHIs.
4134       Type *VecTy =
4135           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4136       Value *EntryPart = PHINode::Create(
4137           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4138       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4139     }
4140     return;
4141   }
4142 
4143   setDebugLocFromInst(Builder, P);
4144 
4145   // This PHINode must be an induction variable.
4146   // Make sure that we know about it.
4147   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4148 
4149   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4150   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4151 
4152   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4153   // which can be found from the original scalar operations.
4154   switch (II.getKind()) {
4155   case InductionDescriptor::IK_NoInduction:
4156     llvm_unreachable("Unknown induction");
4157   case InductionDescriptor::IK_IntInduction:
4158   case InductionDescriptor::IK_FpInduction:
4159     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4160   case InductionDescriptor::IK_PtrInduction: {
4161     // Handle the pointer induction variable case.
4162     assert(P->getType()->isPointerTy() && "Unexpected type.");
4163     // This is the normalized GEP that starts counting at zero.
4164     Value *PtrInd = Induction;
4165     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4166     // Determine the number of scalars we need to generate for each unroll
4167     // iteration. If the instruction is uniform, we only need to generate the
4168     // first lane. Otherwise, we generate all VF values.
4169     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4170     // These are the scalar results. Notice that we don't generate vector GEPs
4171     // because scalar GEPs result in better code.
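    // For example, with VF = 4 and UF = 2, part 0 uses indices PtrInd + 0..3
    // and part 1 uses PtrInd + 4..7 to compute its scalar GEPs (only lane 0
    // of each part if the pointer induction is uniform after vectorization).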
4172     for (unsigned Part = 0; Part < UF; ++Part) {
4173       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4174         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4175         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4176         Value *SclrGep =
4177             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4178         SclrGep->setName("next.gep");
4179         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4180       }
4181     }
4182     return;
4183   }
4184   }
4185 }
4186 
4187 /// A helper function for checking whether an integer division-related
4188 /// instruction may divide by zero (in which case it must be predicated if
4189 /// executed conditionally in the scalar code).
4190 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4192 /// converted into multiplication, so we will still end up scalarizing
4193 /// the division, but can do so w/o predication.
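/// For example, 'udiv i32 %x, %y' with a non-constant %y may divide by zero,
/// whereas 'udiv i32 %x, 7' cannot.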
4194 static bool mayDivideByZero(Instruction &I) {
4195   assert((I.getOpcode() == Instruction::UDiv ||
4196           I.getOpcode() == Instruction::SDiv ||
4197           I.getOpcode() == Instruction::URem ||
4198           I.getOpcode() == Instruction::SRem) &&
4199          "Unexpected instruction");
4200   Value *Divisor = I.getOperand(1);
4201   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4202   return !CInt || CInt->isZero();
4203 }
4204 
4205 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4206   switch (I.getOpcode()) {
4207   case Instruction::Br:
4208   case Instruction::PHI:
4209   case Instruction::GetElementPtr:
4210     llvm_unreachable("This instruction is handled by a different recipe.");
4211   case Instruction::UDiv:
4212   case Instruction::SDiv:
4213   case Instruction::SRem:
4214   case Instruction::URem:
4215   case Instruction::Add:
4216   case Instruction::FAdd:
4217   case Instruction::Sub:
4218   case Instruction::FSub:
4219   case Instruction::FNeg:
4220   case Instruction::Mul:
4221   case Instruction::FMul:
4222   case Instruction::FDiv:
4223   case Instruction::FRem:
4224   case Instruction::Shl:
4225   case Instruction::LShr:
4226   case Instruction::AShr:
4227   case Instruction::And:
4228   case Instruction::Or:
4229   case Instruction::Xor: {
4230     // Just widen unops and binops.
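    // For example, with VF = 4 an 'add i32' becomes a single 'add <4 x i32>'
    // per unroll part, operating on the widened operands.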
4231     setDebugLocFromInst(Builder, &I);
4232 
4233     for (unsigned Part = 0; Part < UF; ++Part) {
4234       SmallVector<Value *, 2> Ops;
4235       for (Value *Op : I.operands())
4236         Ops.push_back(getOrCreateVectorValue(Op, Part));
4237 
4238       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4239 
4240       if (auto *VecOp = dyn_cast<Instruction>(V))
4241         VecOp->copyIRFlags(&I);
4242 
4243       // Use this vector value for all users of the original instruction.
4244       VectorLoopValueMap.setVectorValue(&I, Part, V);
4245       addMetadata(V, &I);
4246     }
4247 
4248     break;
4249   }
4250   case Instruction::Select: {
4251     // Widen selects.
4252     // If the selector is loop invariant we can create a select
4253     // instruction with a scalar condition. Otherwise, use vector-select.
4254     auto *SE = PSE.getSE();
4255     bool InvariantCond =
4256         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4257     setDebugLocFromInst(Builder, &I);
4258 
    // The condition can be loop invariant but still defined inside the
4260     // loop. This means that we can't just use the original 'cond' value.
4261     // We have to take the 'vectorized' value and pick the first lane.
4262     // Instcombine will make this a no-op.
4263 
4264     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4265 
4266     for (unsigned Part = 0; Part < UF; ++Part) {
4267       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4268       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4269       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4270       Value *Sel =
4271           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4272       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4273       addMetadata(Sel, &I);
4274     }
4275 
4276     break;
4277   }
4278 
4279   case Instruction::ICmp:
4280   case Instruction::FCmp: {
4281     // Widen compares. Generate vector compares.
4282     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4283     auto *Cmp = cast<CmpInst>(&I);
4284     setDebugLocFromInst(Builder, Cmp);
4285     for (unsigned Part = 0; Part < UF; ++Part) {
4286       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4287       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4288       Value *C = nullptr;
4289       if (FCmp) {
4290         // Propagate fast math flags.
4291         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4292         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4293         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4294       } else {
4295         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4296       }
4297       VectorLoopValueMap.setVectorValue(&I, Part, C);
4298       addMetadata(C, &I);
4299     }
4300 
4301     break;
4302   }
4303 
4304   case Instruction::ZExt:
4305   case Instruction::SExt:
4306   case Instruction::FPToUI:
4307   case Instruction::FPToSI:
4308   case Instruction::FPExt:
4309   case Instruction::PtrToInt:
4310   case Instruction::IntToPtr:
4311   case Instruction::SIToFP:
4312   case Instruction::UIToFP:
4313   case Instruction::Trunc:
4314   case Instruction::FPTrunc:
4315   case Instruction::BitCast: {
4316     auto *CI = cast<CastInst>(&I);
4317     setDebugLocFromInst(Builder, CI);
4318 
4319     /// Vectorize casts.
4320     Type *DestTy =
4321         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4322 
4323     for (unsigned Part = 0; Part < UF; ++Part) {
4324       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4325       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4326       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4327       addMetadata(Cast, &I);
4328     }
4329     break;
4330   }
4331 
4332   case Instruction::Call: {
4333     // Ignore dbg intrinsics.
4334     if (isa<DbgInfoIntrinsic>(I))
4335       break;
4336     setDebugLocFromInst(Builder, &I);
4337 
4338     Module *M = I.getParent()->getParent()->getParent();
4339     auto *CI = cast<CallInst>(&I);
4340 
4341     StringRef FnName = CI->getCalledFunction()->getName();
4342     Function *F = CI->getCalledFunction();
4343     Type *RetTy = ToVectorTy(CI->getType(), VF);
4344     SmallVector<Type *, 4> Tys;
4345     for (Value *ArgOperand : CI->arg_operands())
4346       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4347 
4348     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4349 
    // This flag shows whether we use an intrinsic or a regular call for the
    // vectorized version of the instruction, i.e., whether it is beneficial
    // to perform the intrinsic call rather than the library call.
4353     bool NeedToScalarize;
4354     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4355     bool UseVectorIntrinsic =
4356         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4357     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4358            "Instruction should be scalarized elsewhere.");
4359 
4360     for (unsigned Part = 0; Part < UF; ++Part) {
4361       SmallVector<Value *, 4> Args;
4362       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4363         Value *Arg = CI->getArgOperand(i);
4364         // Some intrinsics have a scalar argument - don't replace it with a
4365         // vector.
4366         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4367           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4368         Args.push_back(Arg);
4369       }
4370 
4371       Function *VectorF;
4372       if (UseVectorIntrinsic) {
4373         // Use vector version of the intrinsic.
4374         Type *TysForDecl[] = {CI->getType()};
4375         if (VF > 1)
4376           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4377         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4378       } else {
4379         // Use vector version of the library call.
4380         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4381         assert(!VFnName.empty() && "Vector function name is empty.");
4382         VectorF = M->getFunction(VFnName);
4383         if (!VectorF) {
4384           // Generate a declaration
4385           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4386           VectorF =
4387               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4388           VectorF->copyAttributesFrom(F);
4389         }
4390       }
4391       assert(VectorF && "Can't create vector function.");
4392 
4393       SmallVector<OperandBundleDef, 1> OpBundles;
4394       CI->getOperandBundlesAsDefs(OpBundles);
4395       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4396 
4397       if (isa<FPMathOperator>(V))
4398         V->copyFastMathFlags(CI);
4399 
4400       VectorLoopValueMap.setVectorValue(&I, Part, V);
4401       addMetadata(V, &I);
4402     }
4403 
4404     break;
4405   }
4406 
4407   default:
4408     // This instruction is not vectorized by simple widening.
4409     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4410     llvm_unreachable("Unhandled instruction!");
4411   } // end of switch.
4412 }
4413 
4414 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4415   // We should not collect Scalars more than once per VF. Right now, this
4416   // function is called from collectUniformsAndScalars(), which already does
4417   // this check. Collecting Scalars for VF=1 does not make any sense.
4418   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4419          "This function should not be visited twice for the same VF");
4420 
4421   SmallSetVector<Instruction *, 8> Worklist;
4422 
4423   // These sets are used to seed the analysis with pointers used by memory
4424   // accesses that will remain scalar.
4425   SmallSetVector<Instruction *, 8> ScalarPtrs;
4426   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4427 
4428   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4429   // The pointer operands of loads and stores will be scalar as long as the
4430   // memory access is not a gather or scatter operation. The value operand of a
4431   // store will remain scalar if the store is scalarized.
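  // For example, the address of a consecutive, widened load remains scalar
  // because a single wide load is emitted from it, whereas a gather consumes
  // a vector of pointers.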
4432   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4433     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4434     assert(WideningDecision != CM_Unknown &&
4435            "Widening decision should be ready at this moment");
4436     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4437       if (Ptr == Store->getValueOperand())
4438         return WideningDecision == CM_Scalarize;
4439     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4441     return WideningDecision != CM_GatherScatter;
4442   };
4443 
4444   // A helper that returns true if the given value is a bitcast or
4445   // getelementptr instruction contained in the loop.
4446   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4447     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4448             isa<GetElementPtrInst>(V)) &&
4449            !TheLoop->isLoopInvariant(V);
4450   };
4451 
4452   // A helper that evaluates a memory access's use of a pointer. If the use
4453   // will be a scalar use, and the pointer is only used by memory accesses, we
4454   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4455   // PossibleNonScalarPtrs.
4456   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4457     // We only care about bitcast and getelementptr instructions contained in
4458     // the loop.
4459     if (!isLoopVaryingBitCastOrGEP(Ptr))
4460       return;
4461 
4462     // If the pointer has already been identified as scalar (e.g., if it was
4463     // also identified as uniform), there's nothing to do.
4464     auto *I = cast<Instruction>(Ptr);
4465     if (Worklist.count(I))
4466       return;
4467 
4468     // If the use of the pointer will be a scalar use, and all users of the
4469     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4470     // place the pointer in PossibleNonScalarPtrs.
4471     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4472           return isa<LoadInst>(U) || isa<StoreInst>(U);
4473         }))
4474       ScalarPtrs.insert(I);
4475     else
4476       PossibleNonScalarPtrs.insert(I);
4477   };
4478 
4479   // We seed the scalars analysis with three classes of instructions: (1)
4480   // instructions marked uniform-after-vectorization, (2) bitcast and
4481   // getelementptr instructions used by memory accesses requiring a scalar use,
4482   // and (3) pointer induction variables and their update instructions (we
4483   // currently only scalarize these).
4484   //
4485   // (1) Add to the worklist all instructions that have been identified as
4486   // uniform-after-vectorization.
4487   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4488 
4489   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4490   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4492   // scatter operation. The value operand of a store will remain scalar if the
4493   // store is scalarized.
4494   for (auto *BB : TheLoop->blocks())
4495     for (auto &I : *BB) {
4496       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4497         evaluatePtrUse(Load, Load->getPointerOperand());
4498       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4499         evaluatePtrUse(Store, Store->getPointerOperand());
4500         evaluatePtrUse(Store, Store->getValueOperand());
4501       }
4502     }
4503   for (auto *I : ScalarPtrs)
4504     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4505       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4506       Worklist.insert(I);
4507     }
4508 
4509   // (3) Add to the worklist all pointer induction variables and their update
4510   // instructions.
4511   //
4512   // TODO: Once we are able to vectorize pointer induction variables we should
4513   //       no longer insert them into the worklist here.
4514   auto *Latch = TheLoop->getLoopLatch();
4515   for (auto &Induction : *Legal->getInductionVars()) {
4516     auto *Ind = Induction.first;
4517     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4518     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4519       continue;
4520     Worklist.insert(Ind);
4521     Worklist.insert(IndUpdate);
4522     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4523     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4524                       << "\n");
4525   }
4526 
4527   // Insert the forced scalars.
4528   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4529   // induction variable when the PHI user is scalarized.
4530   auto ForcedScalar = ForcedScalars.find(VF);
4531   if (ForcedScalar != ForcedScalars.end())
4532     for (auto *I : ForcedScalar->second)
4533       Worklist.insert(I);
4534 
4535   // Expand the worklist by looking through any bitcasts and getelementptr
4536   // instructions we've already identified as scalar. This is similar to the
4537   // expansion step in collectLoopUniforms(); however, here we're only
4538   // expanding to include additional bitcasts and getelementptr instructions.
4539   unsigned Idx = 0;
4540   while (Idx != Worklist.size()) {
4541     Instruction *Dst = Worklist[Idx++];
4542     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4543       continue;
4544     auto *Src = cast<Instruction>(Dst->getOperand(0));
4545     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4546           auto *J = cast<Instruction>(U);
4547           return !TheLoop->contains(J) || Worklist.count(J) ||
4548                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4549                   isScalarUse(J, Src));
4550         })) {
4551       Worklist.insert(Src);
4552       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4553     }
4554   }
4555 
4556   // An induction variable will remain scalar if all users of the induction
4557   // variable and induction variable update remain scalar.
4558   for (auto &Induction : *Legal->getInductionVars()) {
4559     auto *Ind = Induction.first;
4560     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4561 
4562     // We already considered pointer induction variables, so there's no reason
4563     // to look at their users again.
4564     //
4565     // TODO: Once we are able to vectorize pointer induction variables we
4566     //       should no longer skip over them here.
4567     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4568       continue;
4569 
4570     // Determine if all users of the induction variable are scalar after
4571     // vectorization.
4572     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4573       auto *I = cast<Instruction>(U);
4574       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4575     });
4576     if (!ScalarInd)
4577       continue;
4578 
4579     // Determine if all users of the induction variable update instruction are
4580     // scalar after vectorization.
4581     auto ScalarIndUpdate =
4582         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4583           auto *I = cast<Instruction>(U);
4584           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4585         });
4586     if (!ScalarIndUpdate)
4587       continue;
4588 
4589     // The induction variable and its update instruction will remain scalar.
4590     Worklist.insert(Ind);
4591     Worklist.insert(IndUpdate);
4592     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4593     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4594                       << "\n");
4595   }
4596 
4597   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4598 }
4599 
4600 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4601   if (!blockNeedsPredication(I->getParent()))
4602     return false;
4603   switch(I->getOpcode()) {
4604   default:
4605     break;
4606   case Instruction::Load:
4607   case Instruction::Store: {
4608     if (!Legal->isMaskRequired(I))
4609       return false;
4610     auto *Ptr = getLoadStorePointerOperand(I);
4611     auto *Ty = getMemInstValueType(I);
4612     // We have already decided how to vectorize this instruction, get that
4613     // result.
4614     if (VF > 1) {
4615       InstWidening WideningDecision = getWideningDecision(I, VF);
4616       assert(WideningDecision != CM_Unknown &&
4617              "Widening decision should be ready at this moment");
4618       return WideningDecision == CM_Scalarize;
4619     }
4620     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4621     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4622                                 isLegalMaskedGather(Ty, Alignment))
4623                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4624                                 isLegalMaskedScatter(Ty, Alignment));
4625   }
4626   case Instruction::UDiv:
4627   case Instruction::SDiv:
4628   case Instruction::SRem:
4629   case Instruction::URem:
4630     return mayDivideByZero(*I);
4631   }
4632   return false;
4633 }
4634 
4635 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4636                                                                unsigned VF) {
4637   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4638   assert(getWideningDecision(I, VF) == CM_Unknown &&
4639          "Decision should not be set yet.");
4640   auto *Group = getInterleavedAccessGroup(I);
4641   assert(Group && "Must have a group.");
4642 
  // If the instruction's allocated size doesn't equal its type size, it
4644   // requires padding and will be scalarized.
4645   auto &DL = I->getModule()->getDataLayout();
4646   auto *ScalarTy = getMemInstValueType(I);
4647   if (hasIrregularType(ScalarTy, DL, VF))
4648     return false;
4649 
4650   // Check if masking is required.
4651   // A Group may need masking for one of two reasons: it resides in a block that
4652   // needs predication, or it was decided to use masking to deal with gaps.
4653   bool PredicatedAccessRequiresMasking =
4654       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4655   bool AccessWithGapsRequiresMasking =
4656       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4657   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4658     return true;
4659 
4660   // If masked interleaving is required, we expect that the user/target had
4661   // enabled it, because otherwise it either wouldn't have been created or
4662   // it should have been invalidated by the CostModel.
4663   assert(useMaskedInterleavedAccesses(TTI) &&
4664          "Masked interleave-groups for predicated accesses are not enabled.");
4665 
4666   auto *Ty = getMemInstValueType(I);
4667   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4668   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4669                           : TTI.isLegalMaskedStore(Ty, Alignment);
4670 }
4671 
4672 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4673                                                                unsigned VF) {
4674   // Get and ensure we have a valid memory instruction.
4675   LoadInst *LI = dyn_cast<LoadInst>(I);
4676   StoreInst *SI = dyn_cast<StoreInst>(I);
4677   assert((LI || SI) && "Invalid memory instruction");
4678 
4679   auto *Ptr = getLoadStorePointerOperand(I);
4680 
4681   // In order to be widened, the pointer should be consecutive, first of all.
4682   if (!Legal->isConsecutivePtr(Ptr))
4683     return false;
4684 
4685   // If the instruction is a store located in a predicated block, it will be
4686   // scalarized.
4687   if (isScalarWithPredication(I))
4688     return false;
4689 
  // If the instruction's allocated size doesn't equal its type size, it
4691   // requires padding and will be scalarized.
4692   auto &DL = I->getModule()->getDataLayout();
4693   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4694   if (hasIrregularType(ScalarTy, DL, VF))
4695     return false;
4696 
4697   return true;
4698 }
4699 
4700 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4701   // We should not collect Uniforms more than once per VF. Right now,
4702   // this function is called from collectUniformsAndScalars(), which
4703   // already does this check. Collecting Uniforms for VF=1 does not make any
4704   // sense.
4705 
4706   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4707          "This function should not be visited twice for the same VF");
4708 
  // Visit the list of Uniforms. If we find no uniform value, we won't
  // analyze it again. Uniforms.count(VF) will return 1.
4711   Uniforms[VF].clear();
4712 
4713   // We now know that the loop is vectorizable!
4714   // Collect instructions inside the loop that will remain uniform after
4715   // vectorization.
4716 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
4719   auto isOutOfScope = [&](Value *V) -> bool {
4720     Instruction *I = dyn_cast<Instruction>(V);
4721     return (!I || !TheLoop->contains(I));
4722   };
4723 
4724   SetVector<Instruction *> Worklist;
4725   BasicBlock *Latch = TheLoop->getLoopLatch();
4726 
4727   // Instructions that are scalar with predication must not be considered
4728   // uniform after vectorization, because that would create an erroneous
4729   // replicating region where only a single instance out of VF should be formed.
4730   // TODO: optimize such seldom cases if found important, see PR40816.
4731   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4732     if (isScalarWithPredication(I, VF)) {
4733       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4734                         << *I << "\n");
4735       return;
4736     }
4737     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4738     Worklist.insert(I);
4739   };
4740 
4741   // Start with the conditional branch. If the branch condition is an
4742   // instruction contained in the loop that is only used by the branch, it is
4743   // uniform.
4744   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4745   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4746     addToWorklistIfAllowed(Cmp);
4747 
4748   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4749   // are pointers that are treated like consecutive pointers during
4750   // vectorization. The pointer operands of interleaved accesses are an
4751   // example.
4752   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4753 
4754   // Holds pointer operands of instructions that are possibly non-uniform.
4755   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4756 
4757   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4758     InstWidening WideningDecision = getWideningDecision(I, VF);
4759     assert(WideningDecision != CM_Unknown &&
4760            "Widening decision should be ready at this moment");
4761 
4762     return (WideningDecision == CM_Widen ||
4763             WideningDecision == CM_Widen_Reverse ||
4764             WideningDecision == CM_Interleave);
4765   };
4766   // Iterate over the instructions in the loop, and collect all
4767   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4768   // that a consecutive-like pointer operand will be scalarized, we collect it
4769   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4770   // getelementptr instruction can be used by both vectorized and scalarized
4771   // memory instructions. For example, if a loop loads and stores from the same
4772   // location, but the store is conditional, the store will be scalarized, and
4773   // the getelementptr won't remain uniform.
4774   for (auto *BB : TheLoop->blocks())
4775     for (auto &I : *BB) {
4776       // If there's no pointer operand, there's nothing to do.
4777       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4778       if (!Ptr)
4779         continue;
4780 
4781       // True if all users of Ptr are memory accesses that have Ptr as their
4782       // pointer operand.
4783       auto UsersAreMemAccesses =
4784           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4785             return getLoadStorePointerOperand(U) == Ptr;
4786           });
4787 
4788       // Ensure the memory instruction will not be scalarized or used by
4789       // gather/scatter, making its pointer operand non-uniform. If the pointer
4790       // operand is used by any instruction other than a memory access, we
4791       // conservatively assume the pointer operand may be non-uniform.
4792       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4793         PossibleNonUniformPtrs.insert(Ptr);
4794 
4795       // If the memory instruction will be vectorized and its pointer operand
4796       // is consecutive-like, or interleaving - the pointer operand should
4797       // remain uniform.
4798       else
4799         ConsecutiveLikePtrs.insert(Ptr);
4800     }
4801 
4802   // Add to the Worklist all consecutive and consecutive-like pointers that
4803   // aren't also identified as possibly non-uniform.
4804   for (auto *V : ConsecutiveLikePtrs)
4805     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4806       addToWorklistIfAllowed(V);
4807 
  // Expand Worklist in topological order: whenever a new instruction is
  // added, its users should already be inside Worklist. This ensures that a
  // uniform instruction will only be used by uniform instructions.
4811   unsigned idx = 0;
4812   while (idx != Worklist.size()) {
4813     Instruction *I = Worklist[idx++];
4814 
4815     for (auto OV : I->operand_values()) {
4816       // isOutOfScope operands cannot be uniform instructions.
4817       if (isOutOfScope(OV))
4818         continue;
4819       // First order recurrence Phi's should typically be considered
4820       // non-uniform.
4821       auto *OP = dyn_cast<PHINode>(OV);
4822       if (OP && Legal->isFirstOrderRecurrence(OP))
4823         continue;
4824       // If all the users of the operand are uniform, then add the
4825       // operand into the uniform worklist.
4826       auto *OI = cast<Instruction>(OV);
4827       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4828             auto *J = cast<Instruction>(U);
4829             return Worklist.count(J) ||
4830                    (OI == getLoadStorePointerOperand(J) &&
4831                     isUniformDecision(J, VF));
4832           }))
4833         addToWorklistIfAllowed(OI);
4834     }
4835   }
4836 
4837   // Returns true if Ptr is the pointer operand of a memory access instruction
4838   // I, and I is known to not require scalarization.
4839   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4840     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4841   };
4842 
4843   // For an instruction to be added into Worklist above, all its users inside
4844   // the loop should also be in Worklist. However, this condition cannot be
4845   // true for phi nodes that form a cyclic dependence. We must process phi
4846   // nodes separately. An induction variable will remain uniform if all users
4847   // of the induction variable and induction variable update remain uniform.
4848   // The code below handles both pointer and non-pointer induction variables.
4849   for (auto &Induction : *Legal->getInductionVars()) {
4850     auto *Ind = Induction.first;
4851     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4852 
4853     // Determine if all users of the induction variable are uniform after
4854     // vectorization.
4855     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4856       auto *I = cast<Instruction>(U);
4857       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4858              isVectorizedMemAccessUse(I, Ind);
4859     });
4860     if (!UniformInd)
4861       continue;
4862 
4863     // Determine if all users of the induction variable update instruction are
4864     // uniform after vectorization.
4865     auto UniformIndUpdate =
4866         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4867           auto *I = cast<Instruction>(U);
4868           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4869                  isVectorizedMemAccessUse(I, IndUpdate);
4870         });
4871     if (!UniformIndUpdate)
4872       continue;
4873 
4874     // The induction variable and its update instruction will remain uniform.
4875     addToWorklistIfAllowed(Ind);
4876     addToWorklistIfAllowed(IndUpdate);
4877   }
4878 
4879   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4880 }
4881 
4882 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4883   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4884 
4885   if (Legal->getRuntimePointerChecking()->Need) {
4886     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4887         "runtime pointer checks needed. Enable vectorization of this "
4888         "loop with '#pragma clang loop vectorize(enable)' when "
4889         "compiling with -Os/-Oz",
4890         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4891     return true;
4892   }
4893 
4894   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4895     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4896         "runtime SCEV checks needed. Enable vectorization of this "
4897         "loop with '#pragma clang loop vectorize(enable)' when "
4898         "compiling with -Os/-Oz",
4899         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4900     return true;
4901   }
4902 
4903   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4904   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4905     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4906         "runtime stride == 1 checks needed. Enable vectorization of "
4907         "this loop with '#pragma clang loop vectorize(enable)' when "
4908         "compiling with -Os/-Oz",
4909         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4910     return true;
4911   }
4912 
4913   return false;
4914 }
4915 
4916 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4917   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4918     // TODO: It may be useful to do this, since the check is still likely to
4919     // be dynamically uniform if the target can skip it.
4920     reportVectorizationFailure(
4921         "Not inserting runtime ptr check for divergent target",
4922         "runtime pointer checks needed. Not enabled for divergent target",
4923         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4924     return None;
4925   }
4926 
4927   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4928   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4929   if (TC == 1) {
4930     reportVectorizationFailure("Single iteration (non) loop",
4931         "loop trip count is one, irrelevant for vectorization",
4932         "SingleIterationLoop", ORE, TheLoop);
4933     return None;
4934   }
4935 
4936   switch (ScalarEpilogueStatus) {
4937   case CM_ScalarEpilogueAllowed:
4938     return computeFeasibleMaxVF(TC);
4939   case CM_ScalarEpilogueNotNeededUsePredicate:
4940     LLVM_DEBUG(
4941         dbgs() << "LV: vector predicate hint/switch found.\n"
4942                << "LV: Not allowing scalar epilogue, creating predicated "
4943                << "vector loop.\n");
4944     break;
4945   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4946     // fallthrough as a special case of OptForSize
4947   case CM_ScalarEpilogueNotAllowedOptSize:
4948     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4949       LLVM_DEBUG(
4950           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4951     else
4952       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4953                         << "count.\n");
4954 
4955     // Bail if runtime checks are required, which are not good when optimizing
4956     // for size.
4957     if (runtimeChecksRequired())
4958       return None;
4959     break;
4960   }
4961 
4962   // Now try to fold the tail by masking.
4963 
4964   // Invalidate interleave groups that require an epilogue if we can't mask
4965   // the interleave-group.
4966   if (!useMaskedInterleavedAccesses(TTI))
4967     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4968 
4969   unsigned MaxVF = computeFeasibleMaxVF(TC);
4970   if (TC > 0 && TC % MaxVF == 0) {
4971     // Accept MaxVF if we do not have a tail.
4972     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4973     return MaxVF;
4974   }
4975 
4976   // If we don't know the precise trip count, or if the trip count that we
4977   // found modulo the vectorization factor is not zero, try to fold the tail
4978   // by masking.
4979   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4980   if (Legal->prepareToFoldTailByMasking()) {
4981     FoldTailByMasking = true;
4982     return MaxVF;
4983   }
4984 
4985   if (TC == 0) {
4986     reportVectorizationFailure(
4987         "Unable to calculate the loop count due to complex control flow",
4988         "unable to calculate the loop count due to complex control flow",
4989         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4990     return None;
4991   }
4992 
4993   reportVectorizationFailure(
4994       "Cannot optimize for size and vectorize at the same time.",
4995       "cannot optimize for size and vectorize at the same time. "
4996       "Enable vectorization of this loop with '#pragma clang loop "
4997       "vectorize(enable)' when compiling with -Os/-Oz",
4998       "NoTailLoopWithOptForSize", ORE, TheLoop);
4999   return None;
5000 }
5001 
5002 unsigned
5003 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5004   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5005   unsigned SmallestType, WidestType;
5006   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5007   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5008 
5009   // Get the maximum safe dependence distance in bits computed by LAA.
5010   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5011   // the memory access that is most restrictive (involved in the smallest
5012   // dependence distance).
5013   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5014 
5015   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5016 
5017   unsigned MaxVectorSize = WidestRegister / WidestType;
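  // Illustrative example (hypothetical target values): with WidestRegister =
  // 256 bits and WidestType = 32 bits, MaxVectorSize is 256 / 32 = 8 lanes; a
  // MaxSafeRegisterWidth of 128 bits would first clamp WidestRegister to 128
  // and yield 128 / 32 = 4 lanes instead.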
5018 
5019   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5020                     << " / " << WidestType << " bits.\n");
5021   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5022                     << WidestRegister << " bits.\n");
5023 
5024   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5025                                  " into one vector!");
5026   if (MaxVectorSize == 0) {
5027     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5028     MaxVectorSize = 1;
5029     return MaxVectorSize;
5030   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5031              isPowerOf2_32(ConstTripCount)) {
5032     // We need to clamp the VF to the constant trip count. There is no point
5033     // in choosing a higher viable VF as done in the loop below.
5034     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5035                       << ConstTripCount << "\n");
5036     MaxVectorSize = ConstTripCount;
5037     return MaxVectorSize;
5038   }
5039 
5040   unsigned MaxVF = MaxVectorSize;
5041   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5042       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5043     // Collect all viable vectorization factors larger than the default MaxVF
5044     // (i.e. MaxVectorSize).
5045     SmallVector<unsigned, 8> VFs;
5046     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5047     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5048       VFs.push_back(VS);
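    // For instance (made-up numbers): if MaxVectorSize is 4 and
    // NewMaxVectorSize = WidestRegister / SmallestType is 16, the candidate
    // VFs collected here are 8 and 16.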
5049 
5050     // For each VF calculate its register usage.
5051     auto RUs = calculateRegisterUsage(VFs);
5052 
5053     // Select the largest VF which doesn't require more registers than existing
5054     // ones.
5055     for (int i = RUs.size() - 1; i >= 0; --i) {
5056       bool Selected = true;
5057       for (auto& pair : RUs[i].MaxLocalUsers) {
5058         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5059         if (pair.second > TargetNumRegisters)
5060           Selected = false;
5061       }
5062       if (Selected) {
5063         MaxVF = VFs[i];
5064         break;
5065       }
5066     }
5067     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5068       if (MaxVF < MinVF) {
5069         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5070                           << ") with target's minimum: " << MinVF << '\n');
5071         MaxVF = MinVF;
5072       }
5073     }
5074   }
5075   return MaxVF;
5076 }
5077 
5078 VectorizationFactor
5079 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5080   float Cost = expectedCost(1).first;
5081   const float ScalarCost = Cost;
5082   unsigned Width = 1;
5083   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5084 
5085   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5086   if (ForceVectorization && MaxVF > 1) {
5087     // Ignore scalar width, because the user explicitly wants vectorization.
5088     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5089     // evaluation.
5090     Cost = std::numeric_limits<float>::max();
5091   }
5092 
5093   for (unsigned i = 2; i <= MaxVF; i *= 2) {
5094     // Notice that the vector loop needs to be executed fewer times, so
5095     // we need to divide the cost of the vector loop by the vectorization
5096     // factor (the number of elements processed per vector iteration).
5097     VectorizationCostTy C = expectedCost(i);
5098     float VectorCost = C.first / (float)i;
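    // For example (illustrative costs): an expected vector cost of 20 at
    // VF = 4 gives a per-iteration cost of 5, which is only selected below if
    // it is lower than the best cost seen so far (initially the scalar cost,
    // or FLT_MAX if vectorization is forced).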
5099     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5100                       << " costs: " << (int)VectorCost << ".\n");
5101     if (!C.second && !ForceVectorization) {
5102       LLVM_DEBUG(
5103           dbgs() << "LV: Not considering vector loop of width " << i
5104                  << " because it will not generate any vector instructions.\n");
5105       continue;
5106     }
5107     if (VectorCost < Cost) {
5108       Cost = VectorCost;
5109       Width = i;
5110     }
5111   }
5112 
5113   if (!EnableCondStoresVectorization && NumPredStores) {
5114     reportVectorizationFailure("There are conditional stores.",
5115         "store that is conditionally executed prevents vectorization",
5116         "ConditionalStore", ORE, TheLoop);
5117     Width = 1;
5118     Cost = ScalarCost;
5119   }
5120 
5121   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5122              << "LV: Vectorization seems to be not beneficial, "
5123              << "but was forced by a user.\n");
5124   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5125   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5126   return Factor;
5127 }
5128 
5129 std::pair<unsigned, unsigned>
5130 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5131   unsigned MinWidth = -1U;
5132   unsigned MaxWidth = 8;
5133   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5134 
5135   // For each block.
5136   for (BasicBlock *BB : TheLoop->blocks()) {
5137     // For each instruction in the loop.
5138     for (Instruction &I : BB->instructionsWithoutDebug()) {
5139       Type *T = I.getType();
5140 
5141       // Skip ignored values.
5142       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5143         continue;
5144 
5145       // Only examine Loads, Stores and PHINodes.
5146       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5147         continue;
5148 
5149       // Examine PHI nodes that are reduction variables. Update the type to
5150       // account for the recurrence type.
5151       if (auto *PN = dyn_cast<PHINode>(&I)) {
5152         if (!Legal->isReductionVariable(PN))
5153           continue;
5154         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5155         T = RdxDesc.getRecurrenceType();
5156       }
5157 
5158       // Examine the stored values.
5159       if (auto *ST = dyn_cast<StoreInst>(&I))
5160         T = ST->getValueOperand()->getType();
5161 
5162       // Ignore loaded pointer types and stored pointer types that are not
5163       // vectorizable.
5164       //
5165       // FIXME: The check here attempts to predict whether a load or store will
5166       //        be vectorized. We only know this for certain after a VF has
5167       //        been selected. Here, we assume that if an access can be
5168       //        vectorized, it will be. We should also look at extending this
5169       //        optimization to non-pointer types.
5170       //
5171       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5172           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5173         continue;
5174 
5175       MinWidth = std::min(MinWidth,
5176                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5177       MaxWidth = std::max(MaxWidth,
5178                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5179     }
5180   }
5181 
5182   return {MinWidth, MaxWidth};
5183 }
5184 
5185 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5186                                                            unsigned LoopCost) {
5187   // -- The interleave heuristics --
5188   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5189   // There are many micro-architectural considerations that we can't predict
5190   // at this level. For example, frontend pressure (on decode or fetch) due to
5191   // code size, or the number and capabilities of the execution ports.
5192   //
5193   // We use the following heuristics to select the interleave count:
5194   // 1. If the code has reductions, then we interleave to break the cross
5195   // iteration dependency.
5196   // 2. If the loop is really small, then we interleave to reduce the loop
5197   // overhead.
5198   // 3. We don't interleave if we think that we will spill registers to memory
5199   // due to the increased register pressure.
5200 
5201   if (!isScalarEpilogueAllowed())
5202     return 1;
5203 
5204   // The maximum safe dependence distance already limits vectorization; do not interleave.
5205   if (Legal->getMaxSafeDepDistBytes() != -1U)
5206     return 1;
5207 
5208   // Do not interleave loops with a relatively small known or estimated trip
5209   // count.
5210   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5211   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5212     return 1;
5213 
5214   RegisterUsage R = calculateRegisterUsage({VF})[0];
5215   // These values are used as divisors below, so assume that we have at least
5216   // one instruction that uses at least one register.
5217   for (auto& pair : R.MaxLocalUsers) {
5218     pair.second = std::max(pair.second, 1U);
5219   }
5220 
5221   // We calculate the interleave count using the following formula.
5222   // Subtract the number of loop invariants from the number of available
5223   // registers. These registers are used by all of the interleaved instances.
5224   // Next, divide the remaining registers by the number of registers that is
5225   // required by the loop, in order to estimate how many parallel instances
5226   // fit without causing spills. All of this is rounded down if necessary to be
5227   // a power of two. We want power of two interleave count to simplify any
5228   // addressing operations or alignment considerations.
5229   // We also want power-of-two interleave counts to ensure that the induction
5230   // variable of the vector loop wraps to zero when the tail is folded by
5231   // masking; this currently happens when optimizing for size, where we return 1 above.
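  // As an illustration with made-up numbers: given 16 available registers of a
  // class, 2 loop-invariant values and a maximum of 4 values of that class live
  // at once, the candidate count is PowerOf2Floor((16 - 2) / 4) = 2; the final
  // IC below is the minimum of the candidates over all register classes.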
5232   unsigned IC = UINT_MAX;
5233 
5234   for (auto& pair : R.MaxLocalUsers) {
5235     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5236     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5237                       << " registers of "
5238                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5239     if (VF == 1) {
5240       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5241         TargetNumRegisters = ForceTargetNumScalarRegs;
5242     } else {
5243       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5244         TargetNumRegisters = ForceTargetNumVectorRegs;
5245     }
5246     unsigned MaxLocalUsers = pair.second;
5247     unsigned LoopInvariantRegs = 0;
5248     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5249       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5250 
5251     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5252     // Don't count the induction variable as interleaved.
5253     if (EnableIndVarRegisterHeur) {
5254       TmpIC =
5255           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5256                         std::max(1U, (MaxLocalUsers - 1)));
5257     }
5258 
5259     IC = std::min(IC, TmpIC);
5260   }
5261 
5262   // Clamp the interleave ranges to reasonable counts.
5263   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5264 
5265   // Check if the user has overridden the max.
5266   if (VF == 1) {
5267     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5268       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5269   } else {
5270     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5271       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5272   }
5273 
5274   // If the trip count is a known or estimated compile-time constant, limit
5275   // the interleave count so that it does not exceed the trip count divided by VF.
5276   if (BestKnownTC) {
5277     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5278   }
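  // E.g. (illustrative): with a best known trip count of 64 and VF = 8, the
  // interleave count is capped at 64 / 8 = 8 even if the target allows more.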
5279 
5280   // If we did not calculate the cost for VF (because the user selected the VF)
5281   // then we calculate the cost of VF here.
5282   if (LoopCost == 0)
5283     LoopCost = expectedCost(VF).first;
5284 
5285   assert(LoopCost && "Non-zero loop cost expected");
5286 
5287   // Clamp the calculated IC to be between 1 and the max interleave count
5288   // that the target and trip count allow.
5289   if (IC > MaxInterleaveCount)
5290     IC = MaxInterleaveCount;
5291   else if (IC < 1)
5292     IC = 1;
5293 
5294   // Interleave if we vectorized this loop and there is a reduction that could
5295   // benefit from interleaving.
5296   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5297     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5298     return IC;
5299   }
5300 
5301   // Note that if we've already vectorized the loop we will have done the
5302   // runtime check and so interleaving won't require further checks.
5303   bool InterleavingRequiresRuntimePointerCheck =
5304       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5305 
5306   // We want to interleave small loops in order to reduce the loop overhead and
5307   // potentially expose ILP opportunities.
5308   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5309   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5310     // We assume that the loop's overhead costs about 1, and we use the cost
5311     // model's estimate of the loop cost to interleave until the loop overhead
5312     // is roughly 5% of the total cost of the loop.
5313     unsigned SmallIC =
5314         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
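    // E.g. (illustrative): with SmallLoopCost = 20 and an estimated LoopCost
    // of 5, interleaving by at most PowerOf2Floor(20 / 5) = 4 keeps the assumed
    // per-iteration overhead of 1 near the 5% target mentioned above.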
5315 
5316     // Interleave until store/load ports (estimated by max interleave count) are
5317     // saturated.
5318     unsigned NumStores = Legal->getNumStores();
5319     unsigned NumLoads = Legal->getNumLoads();
5320     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5321     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
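    // E.g. (illustrative): with IC = 8, two stores and one load in the loop,
    // StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8; the larger of the two is
    // used below when it exceeds SmallIC and the runtime heuristic is enabled.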
5322 
5323     // If we have a scalar reduction (vector reductions are already dealt with
5324     // by this point), we can increase the critical path length if the loop
5325     // we're interleaving is inside another loop. Limit it, by default, to 2 so
5326     // that the critical path only gets increased by one reduction operation.
5327     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5328       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5329       SmallIC = std::min(SmallIC, F);
5330       StoresIC = std::min(StoresIC, F);
5331       LoadsIC = std::min(LoadsIC, F);
5332     }
5333 
5334     if (EnableLoadStoreRuntimeInterleave &&
5335         std::max(StoresIC, LoadsIC) > SmallIC) {
5336       LLVM_DEBUG(
5337           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5338       return std::max(StoresIC, LoadsIC);
5339     }
5340 
5341     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5342     return SmallIC;
5343   }
5344 
5345   // Interleave if this is a large loop (small loops are already dealt with by
5346   // this point) that could benefit from interleaving.
5347   bool HasReductions = !Legal->getReductionVars()->empty();
5348   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5349     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5350     return IC;
5351   }
5352 
5353   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5354   return 1;
5355 }
5356 
5357 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5358 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5359   // This function calculates the register usage by measuring the highest number
5360   // of values that are alive at a single location. Obviously, this is a very
5361   // rough estimation. We scan the loop in topological order and
5362   // assign a number to each instruction. We use RPO to ensure that defs are
5363   // met before their users. We assume that each instruction that has in-loop
5364   // users starts an interval. We record every time that an in-loop value is
5365   // used, so we have a list of the first and last occurrences of each
5366   // instruction. Next, we transpose this data structure into a multi map that
5367   // holds the list of intervals that *end* at a specific location. This multi
5368   // map allows us to perform a linear search. We scan the instructions linearly
5369   // and record each time that a new interval starts, by placing it in a set.
5370   // If we find this value in the multi-map then we remove it from the set.
5371   // The max register usage is the maximum size of the set.
5372   // We also search for instructions that are defined outside the loop, but are
5373   // used inside the loop. We need this number separately from the max-interval
5374   // usage number because when we unroll, loop-invariant values do not take
5375   // more registers.
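  // Schematic example (hypothetical values %a, %b, %c): in the sequence
  //   %a = ...; %b = f(%a); %c = g(%a, %b)
  // the interval of %a is open from its definition until its last use in %c,
  // so while visiting %c both %a and %b are open and the maximum register
  // usage of this snippet is 2 values.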
5376   LoopBlocksDFS DFS(TheLoop);
5377   DFS.perform(LI);
5378 
5379   RegisterUsage RU;
5380 
5381   // Each 'key' in the map opens a new interval. The values
5382   // of the map are the index of the 'last seen' usage of the
5383   // instruction that is the key.
5384   using IntervalMap = DenseMap<Instruction *, unsigned>;
5385 
5386   // Maps instruction to its index.
5387   SmallVector<Instruction *, 64> IdxToInstr;
5388   // Marks the end of each interval.
5389   IntervalMap EndPoint;
5390   // Saves the set of instructions that are used in the loop.
5391   SmallPtrSet<Instruction *, 8> Ends;
5392   // Saves the list of values that are used in the loop but are
5393   // defined outside the loop, such as arguments and constants.
5394   SmallPtrSet<Value *, 8> LoopInvariants;
5395 
5396   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5397     for (Instruction &I : BB->instructionsWithoutDebug()) {
5398       IdxToInstr.push_back(&I);
5399 
5400       // Save the end location of each USE.
5401       for (Value *U : I.operands()) {
5402         auto *Instr = dyn_cast<Instruction>(U);
5403 
5404         // Ignore non-instruction values such as arguments, constants, etc.
5405         if (!Instr)
5406           continue;
5407 
5408         // If this instruction is outside the loop then record it and continue.
5409         if (!TheLoop->contains(Instr)) {
5410           LoopInvariants.insert(Instr);
5411           continue;
5412         }
5413 
5414         // Overwrite previous end points.
5415         EndPoint[Instr] = IdxToInstr.size();
5416         Ends.insert(Instr);
5417       }
5418     }
5419   }
5420 
5421   // Saves the list of intervals that end with the index in 'key'.
5422   using InstrList = SmallVector<Instruction *, 2>;
5423   DenseMap<unsigned, InstrList> TransposeEnds;
5424 
5425   // Transpose the EndPoints to a list of values that end at each index.
5426   for (auto &Interval : EndPoint)
5427     TransposeEnds[Interval.second].push_back(Interval.first);
5428 
5429   SmallPtrSet<Instruction *, 8> OpenIntervals;
5430 
5431   // Get the size of the widest register.
5432   unsigned MaxSafeDepDist = -1U;
5433   if (Legal->getMaxSafeDepDistBytes() != -1U)
5434     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5435   unsigned WidestRegister =
5436       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5437   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5438 
5439   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5440   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5441 
5442   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5443 
5444   // A lambda that gets the register usage for the given type and VF.
5445   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5446     if (Ty->isTokenTy())
5447       return 0U;
5448     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5449     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5450   };
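  // For example (hypothetical target): an i64 value at VF = 8 with a 256-bit
  // widest register needs std::max(1, 8 * 64 / 256) = 2 registers, while a
  // small scalar type counts as a single register and token types as zero.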
5451 
5452   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5453     Instruction *I = IdxToInstr[i];
5454 
5455     // Remove all of the instructions that end at this location.
5456     InstrList &List = TransposeEnds[i];
5457     for (Instruction *ToRemove : List)
5458       OpenIntervals.erase(ToRemove);
5459 
5460     // Ignore instructions that are never used within the loop.
5461     if (Ends.find(I) == Ends.end())
5462       continue;
5463 
5464     // Skip ignored values.
5465     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5466       continue;
5467 
5468     // For each VF find the maximum usage of registers.
5469     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5470       // Count the number of live intervals.
5471       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5472 
5473       if (VFs[j] == 1) {
5474         for (auto Inst : OpenIntervals) {
5475           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5476           if (RegUsage.find(ClassID) == RegUsage.end())
5477             RegUsage[ClassID] = 1;
5478           else
5479             RegUsage[ClassID] += 1;
5480         }
5481       } else {
5482         collectUniformsAndScalars(VFs[j]);
5483         for (auto Inst : OpenIntervals) {
5484           // Skip ignored values for VF > 1.
5485           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5486             continue;
5487           if (isScalarAfterVectorization(Inst, VFs[j])) {
5488             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5489             if (RegUsage.find(ClassID) == RegUsage.end())
5490               RegUsage[ClassID] = 1;
5491             else
5492               RegUsage[ClassID] += 1;
5493           } else {
5494             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5495             if (RegUsage.find(ClassID) == RegUsage.end())
5496               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5497             else
5498               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5499           }
5500         }
5501       }
5502 
5503       for (auto& pair : RegUsage) {
5504         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5505           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5506         else
5507           MaxUsages[j][pair.first] = pair.second;
5508       }
5509     }
5510 
5511     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5512                       << OpenIntervals.size() << '\n');
5513 
5514     // Add the current instruction to the list of open intervals.
5515     OpenIntervals.insert(I);
5516   }
5517 
5518   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5519     SmallMapVector<unsigned, unsigned, 4> Invariant;
5520 
5521     for (auto Inst : LoopInvariants) {
5522       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5523       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5524       if (Invariant.find(ClassID) == Invariant.end())
5525         Invariant[ClassID] = Usage;
5526       else
5527         Invariant[ClassID] += Usage;
5528     }
5529 
5530     LLVM_DEBUG({
5531       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5532       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5533              << " item\n";
5534       for (const auto &pair : MaxUsages[i]) {
5535         dbgs() << "LV(REG): RegisterClass: "
5536                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5537                << " registers\n";
5538       }
5539       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5540              << " item\n";
5541       for (const auto &pair : Invariant) {
5542         dbgs() << "LV(REG): RegisterClass: "
5543                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5544                << " registers\n";
5545       }
5546     });
5547 
5548     RU.LoopInvariantRegs = Invariant;
5549     RU.MaxLocalUsers = MaxUsages[i];
5550     RUs[i] = RU;
5551   }
5552 
5553   return RUs;
5554 }
5555 
5556 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5557   // TODO: Cost model for emulated masked load/store is completely
5558   // broken. This hack guides the cost model to use an artificially
5559   // high enough value to practically disable vectorization with such
5560   // operations, except where previously deployed legality hack allowed
5561   // using very low cost values. This is to avoid regressions coming simply
5562   // from moving "masked load/store" check from legality to cost model.
5563   // Masked load/gather emulation was previously never allowed.
5564   // A limited number of masked store/scatter emulations were allowed.
5565   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5566   return isa<LoadInst>(I) ||
5567          (isa<StoreInst>(I) &&
5568           NumPredStores > NumberOfStoresToPredicate);
5569 }
5570 
5571 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5572   // If we aren't vectorizing the loop, or if we've already collected the
5573   // instructions to scalarize, there's nothing to do. Collection may already
5574   // have occurred if we have a user-selected VF and are now computing the
5575   // expected cost for interleaving.
5576   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5577     return;
5578 
5579   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5580   // not profitable to scalarize any instructions, the presence of VF in the
5581   // map will indicate that we've analyzed it already.
5582   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5583 
5584   // Find all the instructions that are scalar with predication in the loop and
5585   // determine if it would be better to not if-convert the blocks they are in.
5586   // If so, we also record the instructions to scalarize.
5587   for (BasicBlock *BB : TheLoop->blocks()) {
5588     if (!blockNeedsPredication(BB))
5589       continue;
5590     for (Instruction &I : *BB)
5591       if (isScalarWithPredication(&I)) {
5592         ScalarCostsTy ScalarCosts;
5593         // Do not apply discount logic if hacked cost is needed
5594         // for emulated masked memrefs.
5595         if (!useEmulatedMaskMemRefHack(&I) &&
5596             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5597           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5598         // Remember that BB will remain after vectorization.
5599         PredicatedBBsAfterVectorization.insert(BB);
5600       }
5601   }
5602 }
5603 
5604 int LoopVectorizationCostModel::computePredInstDiscount(
5605     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5606     unsigned VF) {
5607   assert(!isUniformAfterVectorization(PredInst, VF) &&
5608          "Instruction marked uniform-after-vectorization will be predicated");
5609 
5610   // Initialize the discount to zero, meaning that the scalar version and the
5611   // vector version cost the same.
5612   int Discount = 0;
5613 
5614   // Holds instructions to analyze. The instructions we visit are mapped in
5615   // ScalarCosts. Those instructions are the ones that would be scalarized if
5616   // we find that the scalar version costs less.
5617   SmallVector<Instruction *, 8> Worklist;
5618 
5619   // Returns true if the given instruction can be scalarized.
5620   auto canBeScalarized = [&](Instruction *I) -> bool {
5621     // We only attempt to scalarize instructions forming a single-use chain
5622     // from the original predicated block that would otherwise be vectorized.
5623     // Although not strictly necessary, we give up on instructions we know will
5624     // already be scalar to avoid traversing chains that are unlikely to be
5625     // beneficial.
5626     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5627         isScalarAfterVectorization(I, VF))
5628       return false;
5629 
5630     // If the instruction is scalar with predication, it will be analyzed
5631     // separately. We ignore it within the context of PredInst.
5632     if (isScalarWithPredication(I))
5633       return false;
5634 
5635     // If any of the instruction's operands are uniform after vectorization,
5636     // the instruction cannot be scalarized. This prevents, for example, a
5637     // masked load from being scalarized.
5638     //
5639     // We assume we will only emit a value for lane zero of an instruction
5640     // marked uniform after vectorization, rather than VF identical values.
5641     // Thus, if we scalarize an instruction that uses a uniform, we would
5642     // create uses of values corresponding to the lanes we aren't emitting code
5643     // for. This behavior can be changed by allowing getScalarValue to clone
5644     // the lane zero values for uniforms rather than asserting.
5645     for (Use &U : I->operands())
5646       if (auto *J = dyn_cast<Instruction>(U.get()))
5647         if (isUniformAfterVectorization(J, VF))
5648           return false;
5649 
5650     // Otherwise, we can scalarize the instruction.
5651     return true;
5652   };
5653 
5654   // Compute the expected cost discount from scalarizing the entire expression
5655   // feeding the predicated instruction. We currently only consider expressions
5656   // that are single-use instruction chains.
5657   Worklist.push_back(PredInst);
5658   while (!Worklist.empty()) {
5659     Instruction *I = Worklist.pop_back_val();
5660 
5661     // If we've already analyzed the instruction, there's nothing to do.
5662     if (ScalarCosts.find(I) != ScalarCosts.end())
5663       continue;
5664 
5665     // Compute the cost of the vector instruction. Note that this cost already
5666     // includes the scalarization overhead of the predicated instruction.
5667     unsigned VectorCost = getInstructionCost(I, VF).first;
5668 
5669     // Compute the cost of the scalarized instruction. This cost is the cost of
5670     // the instruction as if it wasn't if-converted and instead remained in the
5671     // predicated block. We will scale this cost by block probability after
5672     // computing the scalarization overhead.
5673     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5674 
5675     // Compute the scalarization overhead of needed insertelement instructions
5676     // and phi nodes.
5677     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5678       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5679                                                  true, false);
5680       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5681     }
5682 
5683     // Compute the scalarization overhead of needed extractelement
5684     // instructions. For each of the instruction's operands, if the operand can
5685     // be scalarized, add it to the worklist; otherwise, account for the
5686     // overhead.
5687     for (Use &U : I->operands())
5688       if (auto *J = dyn_cast<Instruction>(U.get())) {
5689         assert(VectorType::isValidElementType(J->getType()) &&
5690                "Instruction has non-scalar type");
5691         if (canBeScalarized(J))
5692           Worklist.push_back(J);
5693         else if (needsExtract(J, VF))
5694           ScalarCost += TTI.getScalarizationOverhead(
5695                               ToVectorTy(J->getType(),VF), false, true);
5696       }
5697 
5698     // Scale the total scalar cost by block probability.
5699     ScalarCost /= getReciprocalPredBlockProb();
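    // E.g. (illustrative): if the reciprocal block probability is 2 (i.e. the
    // predicated block is assumed to execute on half the iterations), a raw
    // scalar cost of 8 is scaled down to 4 before the discount is computed.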
5700 
5701     // Compute the discount. A non-negative discount means the vector version
5702     // of the instruction costs more, and scalarizing would be beneficial.
5703     Discount += VectorCost - ScalarCost;
5704     ScalarCosts[I] = ScalarCost;
5705   }
5706 
5707   return Discount;
5708 }
5709 
5710 LoopVectorizationCostModel::VectorizationCostTy
5711 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5712   VectorizationCostTy Cost;
5713 
5714   // For each block.
5715   for (BasicBlock *BB : TheLoop->blocks()) {
5716     VectorizationCostTy BlockCost;
5717 
5718     // For each instruction in the old loop.
5719     for (Instruction &I : BB->instructionsWithoutDebug()) {
5720       // Skip ignored values.
5721       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5722           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5723         continue;
5724 
5725       VectorizationCostTy C = getInstructionCost(&I, VF);
5726 
5727       // Check if we should override the cost.
5728       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5729         C.first = ForceTargetInstructionCost;
5730 
5731       BlockCost.first += C.first;
5732       BlockCost.second |= C.second;
5733       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5734                         << " for VF " << VF << " For instruction: " << I
5735                         << '\n');
5736     }
5737 
5738     // If we are vectorizing a predicated block, it will have been
5739     // if-converted. This means that the block's instructions (aside from
5740     // stores and instructions that may divide by zero) will now be
5741     // unconditionally executed. For the scalar case, we may not always execute
5742     // the predicated block. Thus, scale the block's cost by the probability of
5743     // executing it.
5744     if (VF == 1 && blockNeedsPredication(BB))
5745       BlockCost.first /= getReciprocalPredBlockProb();
5746 
5747     Cost.first += BlockCost.first;
5748     Cost.second |= BlockCost.second;
5749   }
5750 
5751   return Cost;
5752 }
5753 
5754 /// Gets the address access SCEV after verifying that the access pattern
5755 /// is loop invariant except for the induction variable dependence.
5756 ///
5757 /// This SCEV can be sent to the Target in order to estimate the address
5758 /// calculation cost.
5759 static const SCEV *getAddressAccessSCEV(
5760               Value *Ptr,
5761               LoopVectorizationLegality *Legal,
5762               PredicatedScalarEvolution &PSE,
5763               const Loop *TheLoop) {
5764 
5765   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5766   if (!Gep)
5767     return nullptr;
5768 
5769   // We are looking for a gep with all loop invariant indices except for one
5770   // which should be an induction variable.
5771   auto SE = PSE.getSE();
5772   unsigned NumOperands = Gep->getNumOperands();
5773   for (unsigned i = 1; i < NumOperands; ++i) {
5774     Value *Opd = Gep->getOperand(i);
5775     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5776         !Legal->isInductionVariable(Opd))
5777       return nullptr;
5778   }
5779 
5780   // Now we know we have a GEP of the form ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5781   return PSE.getSCEV(Ptr);
5782 }
5783 
5784 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5785   return Legal->hasStride(I->getOperand(0)) ||
5786          Legal->hasStride(I->getOperand(1));
5787 }
5788 
5789 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5790                                                                  unsigned VF) {
5791   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5792   Type *ValTy = getMemInstValueType(I);
5793   auto SE = PSE.getSE();
5794 
5795   unsigned AS = getLoadStoreAddressSpace(I);
5796   Value *Ptr = getLoadStorePointerOperand(I);
5797   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5798 
5799   // Figure out whether the access is strided, and get the stride value
5800   // if it is known at compile time.
5801   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5802 
5803   // Get the cost of the scalar memory instruction and address computation.
5804   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5805 
5806   // Don't pass *I here, since it is scalar but will actually be part of a
5807   // vectorized loop where its user is a vectorized instruction.
5808   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5809   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5810                                    Alignment, AS);
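  // E.g. (made-up unit costs): with VF = 4, an address-computation cost of 1
  // and a scalar memory-op cost of 1, the cost so far is 4 * 1 + 4 * 1 = 8,
  // before the extract/insert overhead below is added.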
5811 
5812   // Get the overhead of the extractelement and insertelement instructions
5813   // we might create due to scalarization.
5814   Cost += getScalarizationOverhead(I, VF);
5815 
5816   // If we have a predicated store, it may not be executed for each vector
5817   // lane. Scale the cost by the probability of executing the predicated
5818   // block.
5819   if (isPredicatedInst(I)) {
5820     Cost /= getReciprocalPredBlockProb();
5821 
5822     if (useEmulatedMaskMemRefHack(I))
5823       // Artificially setting to a high enough value to practically disable
5824       // vectorization with such operations.
5825       Cost = 3000000;
5826   }
5827 
5828   return Cost;
5829 }
5830 
5831 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5832                                                              unsigned VF) {
5833   Type *ValTy = getMemInstValueType(I);
5834   Type *VectorTy = ToVectorTy(ValTy, VF);
5835   Value *Ptr = getLoadStorePointerOperand(I);
5836   unsigned AS = getLoadStoreAddressSpace(I);
5837   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5838 
5839   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5840          "Stride should be 1 or -1 for consecutive memory access");
5841   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5842   unsigned Cost = 0;
5843   if (Legal->isMaskRequired(I))
5844     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5845                                       Alignment ? Alignment->value() : 0, AS);
5846   else
5847     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5848 
5849   bool Reverse = ConsecutiveStride < 0;
5850   if (Reverse)
5851     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5852   return Cost;
5853 }
5854 
5855 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5856                                                          unsigned VF) {
5857   Type *ValTy = getMemInstValueType(I);
5858   Type *VectorTy = ToVectorTy(ValTy, VF);
5859   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5860   unsigned AS = getLoadStoreAddressSpace(I);
5861   if (isa<LoadInst>(I)) {
5862     return TTI.getAddressComputationCost(ValTy) +
5863            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5864            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5865   }
5866   StoreInst *SI = cast<StoreInst>(I);
5867 
5868   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
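  // Illustrative breakdown: a store of a loop-invariant value costs one address
  // computation plus one scalar store; if the stored value varies per
  // iteration, an extractelement of the last lane (VF - 1) is added to obtain
  // the value to store.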
5869   return TTI.getAddressComputationCost(ValTy) +
5870          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5871          (isLoopInvariantStoreValue
5872               ? 0
5873               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5874                                        VF - 1));
5875 }
5876 
5877 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5878                                                           unsigned VF) {
5879   Type *ValTy = getMemInstValueType(I);
5880   Type *VectorTy = ToVectorTy(ValTy, VF);
5881   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5882   Value *Ptr = getLoadStorePointerOperand(I);
5883 
5884   return TTI.getAddressComputationCost(VectorTy) +
5885          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5886                                     Legal->isMaskRequired(I),
5887                                     Alignment ? Alignment->value() : 0);
5888 }
5889 
5890 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5891                                                             unsigned VF) {
5892   Type *ValTy = getMemInstValueType(I);
5893   Type *VectorTy = ToVectorTy(ValTy, VF);
5894   unsigned AS = getLoadStoreAddressSpace(I);
5895 
5896   auto Group = getInterleavedAccessGroup(I);
5897   assert(Group && "Fail to get an interleaved access group.");
5898 
5899   unsigned InterleaveFactor = Group->getFactor();
5900   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5901 
5902   // Holds the indices of existing members in an interleaved load group.
5903   // An interleaved store group doesn't need this as it doesn't allow gaps.
5904   SmallVector<unsigned, 4> Indices;
5905   if (isa<LoadInst>(I)) {
5906     for (unsigned i = 0; i < InterleaveFactor; i++)
5907       if (Group->getMember(i))
5908         Indices.push_back(i);
5909   }
5910 
5911   // Calculate the cost of the whole interleaved group.
5912   bool UseMaskForGaps =
5913       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5914   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5915       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5916       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5917 
5918   if (Group->isReverse()) {
5919     // TODO: Add support for reversed masked interleaved access.
5920     assert(!Legal->isMaskRequired(I) &&
5921            "Reverse masked interleaved access not supported.");
5922     Cost += Group->getNumMembers() *
5923             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5924   }
5925   return Cost;
5926 }
5927 
5928 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5929                                                               unsigned VF) {
5930   // Calculate the scalar cost only; the vectorization cost should already be
5931   // available at this point.
5932   if (VF == 1) {
5933     Type *ValTy = getMemInstValueType(I);
5934     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5935     unsigned AS = getLoadStoreAddressSpace(I);
5936 
5937     return TTI.getAddressComputationCost(ValTy) +
5938            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5939   }
5940   return getWideningCost(I, VF);
5941 }
5942 
5943 LoopVectorizationCostModel::VectorizationCostTy
5944 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5945   // If we know that this instruction will remain uniform, check the cost of
5946   // the scalar version.
5947   if (isUniformAfterVectorization(I, VF))
5948     VF = 1;
5949 
5950   if (VF > 1 && isProfitableToScalarize(I, VF))
5951     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5952 
5953   // Forced scalars do not have any scalarization overhead.
5954   auto ForcedScalar = ForcedScalars.find(VF);
5955   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5956     auto InstSet = ForcedScalar->second;
5957     if (InstSet.find(I) != InstSet.end())
5958       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5959   }
5960 
5961   Type *VectorTy;
5962   unsigned C = getInstructionCost(I, VF, VectorTy);
5963 
5964   bool TypeNotScalarized =
5965       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5966   return VectorizationCostTy(C, TypeNotScalarized);
5967 }
5968 
5969 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5970                                                               unsigned VF) {
5971 
5972   if (VF == 1)
5973     return 0;
5974 
5975   unsigned Cost = 0;
5976   Type *RetTy = ToVectorTy(I->getType(), VF);
5977   if (!RetTy->isVoidTy() &&
5978       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5979     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5980 
5981   // Some targets keep addresses scalar.
5982   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5983     return Cost;
5984 
5985   // Some targets support efficient element stores.
5986   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5987     return Cost;
5988 
5989   // Collect operands to consider.
5990   CallInst *CI = dyn_cast<CallInst>(I);
5991   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5992 
5993   // Skip operands that do not require extraction/scalarization and do not incur
5994   // any overhead.
5995   return Cost + TTI.getOperandsScalarizationOverhead(
5996                     filterExtractingOperands(Ops, VF), VF);
5997 }
5998 
5999 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6000   if (VF == 1)
6001     return;
6002   NumPredStores = 0;
6003   for (BasicBlock *BB : TheLoop->blocks()) {
6004     // For each instruction in the old loop.
6005     for (Instruction &I : *BB) {
6006       Value *Ptr =  getLoadStorePointerOperand(&I);
6007       if (!Ptr)
6008         continue;
6009 
6010       // TODO: We should generate better code and update the cost model for
6011       // predicated uniform stores. Today they are treated as any other
6012       // predicated store (see added test cases in
6013       // invariant-store-vectorization.ll).
6014       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6015         NumPredStores++;
6016 
6017       if (Legal->isUniform(Ptr) &&
6018           // Conditional loads and stores should be scalarized and predicated.
6019           // isScalarWithPredication cannot be used here since masked
6020           // gather/scatters are not considered scalar with predication.
6021           !Legal->blockNeedsPredication(I.getParent())) {
6022         // TODO: Avoid replicating loads and stores instead of
6023         // relying on instcombine to remove them.
6024         // Load: Scalar load + broadcast
6025         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6026         unsigned Cost = getUniformMemOpCost(&I, VF);
6027         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6028         continue;
6029       }
6030 
6031       // We assume that widening is the best solution when possible.
6032       if (memoryInstructionCanBeWidened(&I, VF)) {
6033         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6034         int ConsecutiveStride =
6035                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6036         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6037                "Expected consecutive stride.");
6038         InstWidening Decision =
6039             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6040         setWideningDecision(&I, VF, Decision, Cost);
6041         continue;
6042       }
6043 
6044       // Choose between Interleaving, Gather/Scatter or Scalarization.
6045       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6046       unsigned NumAccesses = 1;
6047       if (isAccessInterleaved(&I)) {
6048         auto Group = getInterleavedAccessGroup(&I);
6049         assert(Group && "Fail to get an interleaved access group.");
6050 
6051         // Make one decision for the whole group.
6052         if (getWideningDecision(&I, VF) != CM_Unknown)
6053           continue;
6054 
6055         NumAccesses = Group->getNumMembers();
6056         if (interleavedAccessCanBeWidened(&I, VF))
6057           InterleaveCost = getInterleaveGroupCost(&I, VF);
6058       }
6059 
6060       unsigned GatherScatterCost =
6061           isLegalGatherOrScatter(&I)
6062               ? getGatherScatterCost(&I, VF) * NumAccesses
6063               : std::numeric_limits<unsigned>::max();
6064 
6065       unsigned ScalarizationCost =
6066           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6067 
6068       // Choose better solution for the current VF,
6069       // write down this decision and use it during vectorization.
6070       unsigned Cost;
6071       InstWidening Decision;
6072       if (InterleaveCost <= GatherScatterCost &&
6073           InterleaveCost < ScalarizationCost) {
6074         Decision = CM_Interleave;
6075         Cost = InterleaveCost;
6076       } else if (GatherScatterCost < ScalarizationCost) {
6077         Decision = CM_GatherScatter;
6078         Cost = GatherScatterCost;
6079       } else {
6080         Decision = CM_Scalarize;
6081         Cost = ScalarizationCost;
6082       }
6083       // If the instruction belongs to an interleave group, the whole group
6084       // receives the same decision. The cost is computed for the whole group,
6085       // but it will actually be assigned to a single instruction.
6086       if (auto Group = getInterleavedAccessGroup(&I))
6087         setWideningDecision(Group, VF, Decision, Cost);
6088       else
6089         setWideningDecision(&I, VF, Decision, Cost);
6090     }
6091   }
6092 
6093   // Make sure that any load of address and any other address computation
6094   // remains scalar unless there is gather/scatter support. This avoids
6095   // inevitable extracts into address registers, and also has the benefit of
6096   // activating LSR more, since that pass can't optimize vectorized
6097   // addresses.
6098   if (TTI.prefersVectorizedAddressing())
6099     return;
6100 
6101   // Start with all scalar pointer uses.
6102   SmallPtrSet<Instruction *, 8> AddrDefs;
6103   for (BasicBlock *BB : TheLoop->blocks())
6104     for (Instruction &I : *BB) {
6105       Instruction *PtrDef =
6106         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6107       if (PtrDef && TheLoop->contains(PtrDef) &&
6108           getWideningDecision(&I, VF) != CM_GatherScatter)
6109         AddrDefs.insert(PtrDef);
6110     }
6111 
6112   // Add all instructions used to generate the addresses.
6113   SmallVector<Instruction *, 4> Worklist;
6114   for (auto *I : AddrDefs)
6115     Worklist.push_back(I);
6116   while (!Worklist.empty()) {
6117     Instruction *I = Worklist.pop_back_val();
6118     for (auto &Op : I->operands())
6119       if (auto *InstOp = dyn_cast<Instruction>(Op))
6120         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6121             AddrDefs.insert(InstOp).second)
6122           Worklist.push_back(InstOp);
6123   }
6124 
6125   for (auto *I : AddrDefs) {
6126     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since that requires knowing whether the
      // loaded value is involved in an address computation, the decision is
      // instead overridden here, where that is known.
6131       InstWidening Decision = getWideningDecision(I, VF);
6132       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6133         // Scalarize a widened load of address.
6134         setWideningDecision(I, VF, CM_Scalarize,
6135                             (VF * getMemoryInstructionCost(I, 1)));
6136       else if (auto Group = getInterleavedAccessGroup(I)) {
6137         // Scalarize an interleave group of address loads.
6138         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6139           if (Instruction *Member = Group->getMember(I))
6140             setWideningDecision(Member, VF, CM_Scalarize,
6141                                 (VF * getMemoryInstructionCost(Member, 1)));
6142         }
6143       }
6144     } else
      // Make sure I gets scalarized and is given a cost estimate without
      // scalarization overhead.
6147       ForcedScalars[VF].insert(I);
6148   }
6149 }
6150 
6151 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6152                                                         unsigned VF,
6153                                                         Type *&VectorTy) {
6154   Type *RetTy = I->getType();
6155   if (canTruncateToMinimalBitwidth(I, VF))
6156     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6157   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6158   auto SE = PSE.getSE();
6159 
6160   // TODO: We need to estimate the cost of intrinsic calls.
6161   switch (I->getOpcode()) {
6162   case Instruction::GetElementPtr:
6163     // We mark this instruction as zero-cost because the cost of GEPs in
6164     // vectorized code depends on whether the corresponding memory instruction
6165     // is scalarized or not. Therefore, we handle GEPs with the memory
6166     // instruction cost.
6167     return 0;
6168   case Instruction::Br: {
    // When instructions are scalarized and predicated, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6172     bool ScalarPredicatedBB = false;
6173     BranchInst *BI = cast<BranchInst>(I);
6174     if (VF > 1 && BI->isConditional() &&
6175         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6176              PredicatedBBsAfterVectorization.end() ||
6177          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6178              PredicatedBBsAfterVectorization.end()))
6179       ScalarPredicatedBB = true;
6180 
6181     if (ScalarPredicatedBB) {
6182       // Return cost for branches around scalarized and predicated blocks.
6183       Type *Vec_i1Ty =
6184           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6185       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6186               (TTI.getCFInstrCost(Instruction::Br) * VF));
6187     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6188       // The back-edge branch will remain, as will all scalar branches.
6189       return TTI.getCFInstrCost(Instruction::Br);
6190     else
6191       // This branch will be eliminated by if-conversion.
6192       return 0;
6193     // Note: We currently assume zero cost for an unconditional branch inside
6194     // a predicated block since it will become a fall-through, although we
6195     // may decide in the future to call TTI for all branches.
6196   }
6197   case Instruction::PHI: {
6198     auto *Phi = cast<PHINode>(I);
6199 
6200     // First-order recurrences are replaced by vector shuffles inside the loop.
6201     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6202     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6203       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6204                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6205 
6206     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6207     // converted into select instructions. We require N - 1 selects per phi
6208     // node, where N is the number of incoming values.
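    // For example, a phi with three incoming values becomes a chain of two
    // selects.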
6209     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6210       return (Phi->getNumIncomingValues() - 1) *
6211              TTI.getCmpSelInstrCost(
6212                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6213                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6214 
6215     return TTI.getCFInstrCost(Instruction::PHI);
6216   }
6217   case Instruction::UDiv:
6218   case Instruction::SDiv:
6219   case Instruction::URem:
6220   case Instruction::SRem:
6221     // If we have a predicated instruction, it may not be executed for each
6222     // vector lane. Get the scalarization cost and scale this amount by the
6223     // probability of executing the predicated block. If the instruction is not
6224     // predicated, we fall through to the next case.
6225     if (VF > 1 && isScalarWithPredication(I)) {
6226       unsigned Cost = 0;
6227 
6228       // These instructions have a non-void type, so account for the phi nodes
6229       // that we will create. This cost is likely to be zero. The phi node
6230       // cost, if any, should be scaled by the block probability because it
6231       // models a copy at the end of each predicated block.
6232       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6233 
6234       // The cost of the non-predicated instruction.
6235       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6236 
6237       // The cost of insertelement and extractelement instructions needed for
6238       // scalarization.
6239       Cost += getScalarizationOverhead(I, VF);
6240 
6241       // Scale the cost by the probability of executing the predicated blocks.
6242       // This assumes the predicated block for each vector lane is equally
6243       // likely.
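      // E.g., with a reciprocal block probability of 2, the predicated cost
      // is halved.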
6244       return Cost / getReciprocalPredBlockProb();
6245     }
6246     LLVM_FALLTHROUGH;
6247   case Instruction::Add:
6248   case Instruction::FAdd:
6249   case Instruction::Sub:
6250   case Instruction::FSub:
6251   case Instruction::Mul:
6252   case Instruction::FMul:
6253   case Instruction::FDiv:
6254   case Instruction::FRem:
6255   case Instruction::Shl:
6256   case Instruction::LShr:
6257   case Instruction::AShr:
6258   case Instruction::And:
6259   case Instruction::Or:
6260   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
6262     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6263       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6266     Value *Op2 = I->getOperand(1);
6267     TargetTransformInfo::OperandValueProperties Op2VP;
6268     TargetTransformInfo::OperandValueKind Op2VK =
6269         TTI.getOperandInfo(Op2, Op2VP);
6270     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6271       Op2VK = TargetTransformInfo::OK_UniformValue;
6272 
6273     SmallVector<const Value *, 4> Operands(I->operand_values());
6274     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6275     return N * TTI.getArithmeticInstrCost(
6276                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6277                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6278   }
6279   case Instruction::FNeg: {
6280     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6281     return N * TTI.getArithmeticInstrCost(
6282                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6283                    TargetTransformInfo::OK_AnyValue,
6284                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6285                    I->getOperand(0), I);
6286   }
6287   case Instruction::Select: {
6288     SelectInst *SI = cast<SelectInst>(I);
6289     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6290     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6291     Type *CondTy = SI->getCondition()->getType();
6292     if (!ScalarCond)
6293       CondTy = VectorType::get(CondTy, VF);
6294 
6295     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6296   }
6297   case Instruction::ICmp:
6298   case Instruction::FCmp: {
6299     Type *ValTy = I->getOperand(0)->getType();
6300     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6301     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6302       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6303     VectorTy = ToVectorTy(ValTy, VF);
6304     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6305   }
6306   case Instruction::Store:
6307   case Instruction::Load: {
6308     unsigned Width = VF;
6309     if (Width > 1) {
6310       InstWidening Decision = getWideningDecision(I, Width);
6311       assert(Decision != CM_Unknown &&
6312              "CM decision should be taken at this point");
6313       if (Decision == CM_Scalarize)
6314         Width = 1;
6315     }
6316     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
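    // Note that Width only determines the VectorTy reported to the caller;
    // the cost itself is computed for the full VF.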
6317     return getMemoryInstructionCost(I, VF);
6318   }
6319   case Instruction::ZExt:
6320   case Instruction::SExt:
6321   case Instruction::FPToUI:
6322   case Instruction::FPToSI:
6323   case Instruction::FPExt:
6324   case Instruction::PtrToInt:
6325   case Instruction::IntToPtr:
6326   case Instruction::SIToFP:
6327   case Instruction::UIToFP:
6328   case Instruction::Trunc:
6329   case Instruction::FPTrunc:
6330   case Instruction::BitCast: {
6331     // We optimize the truncation of induction variables having constant
6332     // integer steps. The cost of these truncations is the same as the scalar
6333     // operation.
6334     if (isOptimizableIVTruncate(I, VF)) {
6335       auto *Trunc = cast<TruncInst>(I);
6336       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6337                                   Trunc->getSrcTy(), Trunc);
6338     }
6339 
6340     Type *SrcScalarTy = I->getOperand(0)->getType();
6341     Type *SrcVecTy =
6342         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6343     if (canTruncateToMinimalBitwidth(I, VF)) {
6344       // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
6346       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6347       //
6348       // Calculate the modified src and dest types.
6349       Type *MinVecTy = VectorTy;
6350       if (I->getOpcode() == Instruction::Trunc) {
6351         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6352         VectorTy =
6353             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6354       } else if (I->getOpcode() == Instruction::ZExt ||
6355                  I->getOpcode() == Instruction::SExt) {
6356         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6357         VectorTy =
6358             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6359       }
6360     }
6361 
6362     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6363     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6364   }
6365   case Instruction::Call: {
6366     bool NeedToScalarize;
6367     CallInst *CI = cast<CallInst>(I);
6368     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6369     if (getVectorIntrinsicIDForCall(CI, TLI))
6370       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6371     return CallCost;
6372   }
6373   default:
6374     // The cost of executing VF copies of the scalar instruction. This opcode
6375     // is unknown. Assume that it is the same as 'mul'.
6376     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6377            getScalarizationOverhead(I, VF);
6378   } // end of switch.
6379 }
6380 
6381 char LoopVectorize::ID = 0;
6382 
6383 static const char lv_name[] = "Loop Vectorization";
6384 
6385 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6386 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6387 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6388 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6389 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6390 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6391 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6392 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6393 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6394 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6395 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6396 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6397 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6398 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6399 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6400 
6401 namespace llvm {
6402 
6403 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6404 
6405 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6406                               bool VectorizeOnlyWhenForced) {
6407   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6408 }
6409 
6410 } // end namespace llvm
6411 
6412 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6413   // Check if the pointer operand of a load or store instruction is
6414   // consecutive.
6415   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6416     return Legal->isConsecutivePtr(Ptr);
6417   return false;
6418 }
6419 
6420 void LoopVectorizationCostModel::collectValuesToIgnore() {
6421   // Ignore ephemeral values.
6422   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6423 
6424   // Ignore type-promoting instructions we identified during reduction
6425   // detection.
6426   for (auto &Reduction : *Legal->getReductionVars()) {
6427     RecurrenceDescriptor &RedDes = Reduction.second;
6428     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6429     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6430   }
6431   // Ignore type-casting instructions we identified during induction
6432   // detection.
6433   for (auto &Induction : *Legal->getInductionVars()) {
6434     InductionDescriptor &IndDes = Induction.second;
6435     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6436     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6437   }
6438 }
6439 
6440 // TODO: we could return a pair of values that specify the max VF and
6441 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do that yet because VPlan at the moment
6443 // doesn't have a cost model that can choose which plan to execute if
6444 // more than one is generated.
6445 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6446                                  LoopVectorizationCostModel &CM) {
6447   unsigned WidestType;
6448   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
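  // E.g., 256-bit wide vector registers and a widest scalar type of 32 bits
  // yield a VF of 8.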
6449   return WidestVectorRegBits / WidestType;
6450 }
6451 
6452 VectorizationFactor
6453 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6454   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
6459   if (!OrigLoop->empty()) {
6460     // If the user doesn't provide a vectorization factor, determine a
6461     // reasonable one.
6462     if (!UserVF) {
6463       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6464       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6465 
6466       // Make sure we have a VF > 1 for stress testing.
6467       if (VPlanBuildStressTest && VF < 2) {
6468         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6469                           << "overriding computed VF.\n");
6470         VF = 4;
6471       }
6472     }
6473     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6474     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6475     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6476                       << " to build VPlans.\n");
6477     buildVPlans(VF, VF);
6478 
6479     // For VPlan build stress testing, we bail out after VPlan construction.
6480     if (VPlanBuildStressTest)
6481       return VectorizationFactor::Disabled();
6482 
6483     return {VF, 0};
6484   }
6485 
6486   LLVM_DEBUG(
6487       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6488                 "VPlan-native path.\n");
6489   return VectorizationFactor::Disabled();
6490 }
6491 
6492 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6493   assert(OrigLoop->empty() && "Inner loop expected.");
6494   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6496     return None;
6497 
6498   // Invalidate interleave groups if all blocks of loop will be predicated.
6499   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6500       !useMaskedInterleavedAccesses(*TTI)) {
6501     LLVM_DEBUG(
6502         dbgs()
6503         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6504            "which requires masked-interleaved support.\n");
6505     CM.InterleaveInfo.reset();
6506   }
6507 
6508   if (UserVF) {
6509     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6510     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6511     // Collect the instructions (and their associated costs) that will be more
6512     // profitable to scalarize.
6513     CM.selectUserVectorizationFactor(UserVF);
6514     buildVPlansWithVPRecipes(UserVF, UserVF);
6515     LLVM_DEBUG(printPlans(dbgs()));
6516     return {{UserVF, 0}};
6517   }
6518 
6519   unsigned MaxVF = MaybeMaxVF.getValue();
6520   assert(MaxVF != 0 && "MaxVF is zero.");
6521 
6522   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6523     // Collect Uniform and Scalar instructions after vectorization with VF.
6524     CM.collectUniformsAndScalars(VF);
6525 
6526     // Collect the instructions (and their associated costs) that will be more
6527     // profitable to scalarize.
6528     if (VF > 1)
6529       CM.collectInstsToScalarize(VF);
6530   }
6531 
6532   buildVPlansWithVPRecipes(1, MaxVF);
6533   LLVM_DEBUG(printPlans(dbgs()));
6534   if (MaxVF == 1)
6535     return VectorizationFactor::Disabled();
6536 
6537   // Select the optimal vectorization factor.
6538   return CM.selectVectorizationFactor(MaxVF);
6539 }
6540 
6541 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6542   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6543                     << '\n');
6544   BestVF = VF;
6545   BestUF = UF;
6546 
6547   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6548     return !Plan->hasVF(VF);
6549   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6551 }
6552 
6553 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6554                                            DominatorTree *DT) {
6555   // Perform the actual loop transformation.
6556 
6557   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6558   VPCallbackILV CallbackILV(ILV);
6559 
6560   VPTransformState State{BestVF, BestUF,      LI,
6561                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6562                          &ILV,   CallbackILV};
6563   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6564   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6565 
6566   //===------------------------------------------------===//
6567   //
  // Notice: any optimization or new instruction that goes
6569   // into the code below should also be implemented in
6570   // the cost-model.
6571   //
6572   //===------------------------------------------------===//
6573 
6574   // 2. Copy and widen instructions from the old loop into the new loop.
6575   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6576   VPlans.front()->execute(&State);
6577 
6578   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6579   //    predication, updating analyses.
6580   ILV.fixVectorizedLoop();
6581 }
6582 
6583 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6584     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6585   BasicBlock *Latch = OrigLoop->getLoopLatch();
6586 
6587   // We create new control-flow for the vectorized loop, so the original
6588   // condition will be dead after vectorization if it's only used by the
6589   // branch.
6590   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6591   if (Cmp && Cmp->hasOneUse())
6592     DeadInstructions.insert(Cmp);
6593 
6594   // We create new "steps" for induction variable updates to which the original
6595   // induction variables map. An original update instruction will be dead if
6596   // all its users except the induction variable are dead.
6597   for (auto &Induction : *Legal->getInductionVars()) {
6598     PHINode *Ind = Induction.first;
6599     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6600     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6601           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6602                                  DeadInstructions.end();
6603         }))
6604       DeadInstructions.insert(IndUpdate);
6605 
    // We also record as "Dead" the type-casting instructions we had identified
6607     // during induction analysis. We don't need any handling for them in the
6608     // vectorized loop because we have proven that, under a proper runtime
6609     // test guarding the vectorized loop, the value of the phi, and the casted
6610     // value of the phi, are the same. The last instruction in this casting chain
6611     // will get its scalar/vector/widened def from the scalar/vector/widened def
6612     // of the respective phi node. Any other casts in the induction def-use chain
6613     // have no other uses outside the phi update chain, and will be ignored.
6614     InductionDescriptor &IndDes = Induction.second;
6615     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6616     DeadInstructions.insert(Casts.begin(), Casts.end());
6617   }
6618 }
6619 
6620 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6621 
6622 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6623 
6624 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6625                                         Instruction::BinaryOps BinOp) {
6626   // When unrolling and the VF is 1, we only need to add a simple scalar.
6627   Type *Ty = Val->getType();
6628   assert(!Ty->isVectorTy() && "Val must be a scalar");
6629 
6630   if (Ty->isFloatingPointTy()) {
6631     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6632 
6633     // Floating point operations had to be 'fast' to enable the unrolling.
6634     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6635     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6636   }
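  // Integer induction step: the result is simply Val + StartIdx * Step.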
6637   Constant *C = ConstantInt::get(Ty, StartIdx);
6638   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6639 }
6640 
6641 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6642   SmallVector<Metadata *, 4> MDs;
6643   // Reserve first location for self reference to the LoopID metadata node.
6644   MDs.push_back(nullptr);
6645   bool IsUnrollMetadata = false;
6646   MDNode *LoopID = L->getLoopID();
6647   if (LoopID) {
6648     // First find existing loop unrolling disable metadata.
6649     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6650       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6651       if (MD) {
6652         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6653         IsUnrollMetadata =
6654             S && S->getString().startswith("llvm.loop.unroll.disable");
6655       }
6656       MDs.push_back(LoopID->getOperand(i));
6657     }
6658   }
6659 
6660   if (!IsUnrollMetadata) {
6661     // Add runtime unroll disable metadata.
6662     LLVMContext &Context = L->getHeader()->getContext();
6663     SmallVector<Metadata *, 1> DisableOperands;
6664     DisableOperands.push_back(
6665         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6666     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6667     MDs.push_back(DisableNode);
6668     MDNode *NewLoopID = MDNode::get(Context, MDs);
6669     // Set operand 0 to refer to the loop id itself.
6670     NewLoopID->replaceOperandWith(0, NewLoopID);
6671     L->setLoopID(NewLoopID);
6672   }
6673 }
6674 
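// Evaluate Predicate at Range.Start, then clamp Range.End down to the first
// larger VF at which Predicate gives a different answer, so that all VFs
// remaining in the range share the same decision; return the answer at
// Range.Start.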
6675 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6676     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6677   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6678   bool PredicateAtRangeStart = Predicate(Range.Start);
6679 
6680   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6681     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6682       Range.End = TmpVF;
6683       break;
6684     }
6685 
6686   return PredicateAtRangeStart;
6687 }
6688 
6689 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6690 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6691 /// of VF's starting at a given VF and extending it as much as possible. Each
6692 /// vectorization decision can potentially shorten this sub-range during
6693 /// buildVPlan().
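/// For example, with MinVF = 1 and MaxVF = 8, if some decision changes at
/// VF = 4, one VPlan is built covering VFs {1,2} and another covering {4,8}.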
6694 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6695   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6696     VFRange SubRange = {VF, MaxVF + 1};
6697     VPlans.push_back(buildVPlan(SubRange));
6698     VF = SubRange.End;
6699   }
6700 }
6701 
6702 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6703                                          VPlanPtr &Plan) {
6704   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6705 
6706   // Look for cached value.
6707   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6708   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6709   if (ECEntryIt != EdgeMaskCache.end())
6710     return ECEntryIt->second;
6711 
6712   VPValue *SrcMask = createBlockInMask(Src, Plan);
6713 
6714   // The terminator has to be a branch inst!
6715   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6716   assert(BI && "Unexpected terminator found");
6717 
6718   if (!BI->isConditional())
6719     return EdgeMaskCache[Edge] = SrcMask;
6720 
6721   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6722   assert(EdgeMask && "No Edge Mask found for condition");
6723 
6724   if (BI->getSuccessor(0) != Dst)
6725     EdgeMask = Builder.createNot(EdgeMask);
6726 
6727   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6728     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6729 
6730   return EdgeMaskCache[Edge] = EdgeMask;
6731 }
6732 
6733 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6734   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6735 
6736   // Look for cached value.
6737   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6738   if (BCEntryIt != BlockMaskCache.end())
6739     return BCEntryIt->second;
6740 
6741   // All-one mask is modelled as no-mask following the convention for masked
6742   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6743   VPValue *BlockMask = nullptr;
6744 
6745   if (OrigLoop->getHeader() == BB) {
6746     if (!CM.blockNeedsPredication(BB))
6747       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6748 
6749     // Introduce the early-exit compare IV <= BTC to form header block mask.
6750     // This is used instead of IV < TC because TC may wrap, unlike BTC.
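    // E.g., with an i8 IV and 256 iterations, TC wraps to 0 while BTC is 255.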
6751     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6752     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6753     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6754     return BlockMaskCache[BB] = BlockMask;
6755   }
6756 
6757   // This is the block mask. We OR all incoming edges.
6758   for (auto *Predecessor : predecessors(BB)) {
6759     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6760     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6761       return BlockMaskCache[BB] = EdgeMask;
6762 
6763     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6764       BlockMask = EdgeMask;
6765       continue;
6766     }
6767 
6768     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6769   }
6770 
6771   return BlockMaskCache[BB] = BlockMask;
6772 }
6773 
6774 VPWidenMemoryInstructionRecipe *
6775 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6776                                   VPlanPtr &Plan) {
6777   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6778     return nullptr;
6779 
6780   auto willWiden = [&](unsigned VF) -> bool {
6781     if (VF == 1)
6782       return false;
6783     LoopVectorizationCostModel::InstWidening Decision =
6784         CM.getWideningDecision(I, VF);
6785     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6786            "CM decision should be taken at this point.");
6787     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6788       return true;
6789     if (CM.isScalarAfterVectorization(I, VF) ||
6790         CM.isProfitableToScalarize(I, VF))
6791       return false;
6792     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6793   };
6794 
6795   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6796     return nullptr;
6797 
6798   VPValue *Mask = nullptr;
6799   if (Legal->isMaskRequired(I))
6800     Mask = createBlockInMask(I->getParent(), Plan);
6801 
6802   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6803 }
6804 
6805 VPWidenIntOrFpInductionRecipe *
6806 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6807   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6808     // Check if this is an integer or fp induction. If so, build the recipe that
6809     // produces its scalar and vector values.
6810     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6811     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6812         II.getKind() == InductionDescriptor::IK_FpInduction)
6813       return new VPWidenIntOrFpInductionRecipe(Phi);
6814 
6815     return nullptr;
6816   }
6817 
6818   // Optimize the special case where the source is a constant integer
6819   // induction variable. Notice that we can only optimize the 'trunc' case
6820   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6821   // (c) other casts depend on pointer size.
6822 
6823   // Determine whether \p K is a truncation based on an induction variable that
6824   // can be optimized.
6825   auto isOptimizableIVTruncate =
6826       [&](Instruction *K) -> std::function<bool(unsigned)> {
6827     return
6828         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6829   };
6830 
6831   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6832                                isOptimizableIVTruncate(I), Range))
6833     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6834                                              cast<TruncInst>(I));
6835   return nullptr;
6836 }
6837 
6838 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6839   PHINode *Phi = dyn_cast<PHINode>(I);
6840   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6841     return nullptr;
6842 
6843   // We know that all PHIs in non-header blocks are converted into selects, so
6844   // we don't have to worry about the insertion order and we can just use the
6845   // builder. At this point we generate the predication tree. There may be
6846   // duplications since this is a simple recursive scan, but future
6847   // optimizations will clean it up.
6848 
6849   SmallVector<VPValue *, 2> Masks;
6850   unsigned NumIncoming = Phi->getNumIncomingValues();
6851   for (unsigned In = 0; In < NumIncoming; In++) {
6852     VPValue *EdgeMask =
6853       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6854     assert((EdgeMask || NumIncoming == 1) &&
6855            "Multiple predecessors with one having a full mask");
6856     if (EdgeMask)
6857       Masks.push_back(EdgeMask);
6858   }
6859   return new VPBlendRecipe(Phi, Masks);
6860 }
6861 
6862 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6863                                  VFRange &Range) {
6864 
6865   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6866       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6867 
6868   if (IsPredicated)
6869     return false;
6870 
6871   auto IsVectorizableOpcode = [](unsigned Opcode) {
6872     switch (Opcode) {
6873     case Instruction::Add:
6874     case Instruction::And:
6875     case Instruction::AShr:
6876     case Instruction::BitCast:
6877     case Instruction::Br:
6878     case Instruction::Call:
6879     case Instruction::FAdd:
6880     case Instruction::FCmp:
6881     case Instruction::FDiv:
6882     case Instruction::FMul:
6883     case Instruction::FNeg:
6884     case Instruction::FPExt:
6885     case Instruction::FPToSI:
6886     case Instruction::FPToUI:
6887     case Instruction::FPTrunc:
6888     case Instruction::FRem:
6889     case Instruction::FSub:
6890     case Instruction::ICmp:
6891     case Instruction::IntToPtr:
6892     case Instruction::Load:
6893     case Instruction::LShr:
6894     case Instruction::Mul:
6895     case Instruction::Or:
6896     case Instruction::PHI:
6897     case Instruction::PtrToInt:
6898     case Instruction::SDiv:
6899     case Instruction::Select:
6900     case Instruction::SExt:
6901     case Instruction::Shl:
6902     case Instruction::SIToFP:
6903     case Instruction::SRem:
6904     case Instruction::Store:
6905     case Instruction::Sub:
6906     case Instruction::Trunc:
6907     case Instruction::UDiv:
6908     case Instruction::UIToFP:
6909     case Instruction::URem:
6910     case Instruction::Xor:
6911     case Instruction::ZExt:
6912       return true;
6913     }
6914     return false;
6915   };
6916 
6917   if (!IsVectorizableOpcode(I->getOpcode()))
6918     return false;
6919 
6920   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6921     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6922     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6923                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6924       return false;
6925   }
6926 
6927   auto willWiden = [&](unsigned VF) -> bool {
6928     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6929                              CM.isProfitableToScalarize(I, VF)))
6930       return false;
6931     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6932       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag indicates whether we use an intrinsic or an ordinary call
      // for the vectorized version of the instruction, i.e. whether it is
      // cheaper to call the intrinsic than a library function.
6937       bool NeedToScalarize;
6938       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6939       bool UseVectorIntrinsic =
6940           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6941       return UseVectorIntrinsic || !NeedToScalarize;
6942     }
6943     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6944       assert(CM.getWideningDecision(I, VF) ==
6945                  LoopVectorizationCostModel::CM_Scalarize &&
6946              "Memory widening decisions should have been taken care by now");
6947       return false;
6948     }
6949     return true;
6950   };
6951 
6952   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6953     return false;
6954   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6955   // to avoid having to split recipes later.
6956   bool IsSingleton = Ingredient2Recipe.count(I);
6957 
6958   // Success: widen this instruction.
6959 
6960   // Use the default widening recipe. We optimize the common case where
6961   // consecutive instructions can be represented by a single recipe.
6962   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6963       LastExtensibleRecipe->appendInstruction(I))
6964     return true;
6965 
6966   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6967   if (!IsSingleton)
6968     LastExtensibleRecipe = WidenRecipe;
6969   setRecipe(I, WidenRecipe);
6970   VPBB->appendRecipe(WidenRecipe);
6971   return true;
6972 }
6973 
6974 VPBasicBlock *VPRecipeBuilder::handleReplication(
6975     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6976     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6977     VPlanPtr &Plan) {
6978   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6979       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6980       Range);
6981 
6982   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6983       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6984 
6985   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6986   setRecipe(I, Recipe);
6987 
  // Find out if I uses a predicated instruction. If so, it will use that
  // instruction's scalar value, so avoid hoisting the insert-element that
  // packs the scalar value into a vector value: packing should happen iff all
  // users use the vector value.
6991   for (auto &Op : I->operands())
6992     if (auto *PredInst = dyn_cast<Instruction>(Op))
6993       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6994         PredInst2Recipe[PredInst]->setAlsoPack(false);
6995 
6996   // Finalize the recipe for Instr, first if it is not predicated.
6997   if (!IsPredicated) {
6998     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6999     VPBB->appendRecipe(Recipe);
7000     return VPBB;
7001   }
7002   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7003   assert(VPBB->getSuccessors().empty() &&
7004          "VPBB has successors when handling predicated replication.");
7005   // Record predicated instructions for above packing optimizations.
7006   PredInst2Recipe[I] = Recipe;
7007   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7008   VPBlockUtils::insertBlockAfter(Region, VPBB);
7009   auto *RegSucc = new VPBasicBlock();
7010   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7011   return RegSucc;
7012 }
7013 
7014 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7015                                                       VPRecipeBase *PredRecipe,
7016                                                       VPlanPtr &Plan) {
7017   // Instructions marked for predication are replicated and placed under an
7018   // if-then construct to prevent side-effects.
7019 
7020   // Generate recipes to compute the block mask for this region.
7021   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7022 
7023   // Build the triangular if-then region.
7024   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7025   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7026   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7027   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7028   auto *PHIRecipe =
7029       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7030   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7031   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7032   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7033 
7034   // Note: first set Entry as region entry and then connect successors starting
7035   // from it in order, to propagate the "parent" of each VPBasicBlock.
7036   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7037   VPBlockUtils::connectBlocks(Pred, Exit);
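  // The region now forms an if-then triangle: Entry branches on the mask to
  // either Pred or Exit, and Pred falls through to Exit.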
7038 
7039   return Region;
7040 }
7041 
7042 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7043                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7044   VPRecipeBase *Recipe = nullptr;
7045 
7046   // First, check for specific widening recipes that deal with memory
7047   // operations, inductions and Phi nodes.
7048   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7049       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7050       (Recipe = tryToBlend(Instr, Plan)) ||
7051       (isa<PHINode>(Instr) &&
7052        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7053     setRecipe(Instr, Recipe);
7054     VPBB->appendRecipe(Recipe);
7055     return true;
7056   }
7057 
7058   // Handle GEP widening.
7059   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7060     auto Scalarize = [&](unsigned VF) {
7061       return CM.isScalarWithPredication(Instr, VF) ||
7062              CM.isScalarAfterVectorization(Instr, VF) ||
7063              CM.isProfitableToScalarize(Instr, VF);
7064     };
7065     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7066       return false;
7067     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7068     setRecipe(Instr, Recipe);
7069     VPBB->appendRecipe(Recipe);
7070     return true;
7071   }
7072 
7073   // Check if Instr is to be widened by a general VPWidenRecipe, after
7074   // having first checked for specific widening recipes.
7075   if (tryToWiden(Instr, VPBB, Range))
7076     return true;
7077 
7078   return false;
7079 }
7080 
7081 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7082                                                         unsigned MaxVF) {
7083   assert(OrigLoop->empty() && "Inner loop expected.");
7084 
7085   // Collect conditions feeding internal conditional branches; they need to be
7086   // represented in VPlan for it to model masking.
7087   SmallPtrSet<Value *, 1> NeedDef;
7088 
7089   auto *Latch = OrigLoop->getLoopLatch();
7090   for (BasicBlock *BB : OrigLoop->blocks()) {
7091     if (BB == Latch)
7092       continue;
7093     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7094     if (Branch && Branch->isConditional())
7095       NeedDef.insert(Branch->getCondition());
7096   }
7097 
7098   // If the tail is to be folded by masking, the primary induction variable
7099   // needs to be represented in VPlan for it to model early-exit masking.
7100   // Also, both the Phi and the live-out instruction of each reduction are
7101   // required in order to introduce a select between them in VPlan.
7102   if (CM.foldTailByMasking()) {
7103     NeedDef.insert(Legal->getPrimaryInduction());
7104     for (auto &Reduction : *Legal->getReductionVars()) {
7105       NeedDef.insert(Reduction.first);
7106       NeedDef.insert(Reduction.second.getLoopExitInstr());
7107     }
7108   }
7109 
7110   // Collect instructions from the original loop that will become trivially dead
7111   // in the vectorized loop. We don't need to vectorize these instructions. For
7112   // example, original induction update instructions can become dead because we
7113   // separately emit induction "steps" when generating code for the new loop.
7114   // Similarly, we create a new latch condition when setting up the structure
7115   // of the new loop, so the old one can become dead.
7116   SmallPtrSet<Instruction *, 4> DeadInstructions;
7117   collectTriviallyDeadInstructions(DeadInstructions);
7118 
7119   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7120     VFRange SubRange = {VF, MaxVF + 1};
7121     VPlans.push_back(
7122         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
7123     VF = SubRange.End;
7124   }
7125 }
7126 
7127 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7128     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7129     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7130 
7131   // Hold a mapping from predicated instructions to their recipes, in order to
7132   // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7134   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7135 
7136   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7137 
7138   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7139 
7140   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7141 
7142   // ---------------------------------------------------------------------------
7143   // Pre-construction: record ingredients whose recipes we'll need to further
7144   // process after constructing the initial VPlan.
7145   // ---------------------------------------------------------------------------
7146 
7147   // Mark instructions we'll need to sink later and their targets as
7148   // ingredients whose recipe we'll need to record.
7149   for (auto &Entry : SinkAfter) {
7150     RecipeBuilder.recordRecipeOf(Entry.first);
7151     RecipeBuilder.recordRecipeOf(Entry.second);
7152   }
7153 
7154   // For each interleave group which is relevant for this (possibly trimmed)
7155   // Range, add it to the set of groups to be later applied to the VPlan and add
7156   // placeholders for its members' Recipes which we'll be replacing with a
7157   // single VPInterleaveRecipe.
7158   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7159     auto applyIG = [IG, this](unsigned VF) -> bool {
7160       return (VF >= 2 && // Query is illegal for VF == 1
7161               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7162                   LoopVectorizationCostModel::CM_Interleave);
7163     };
7164     if (!getDecisionAndClampRange(applyIG, Range))
7165       continue;
7166     InterleaveGroups.insert(IG);
7167     for (unsigned i = 0; i < IG->getFactor(); i++)
7168       if (Instruction *Member = IG->getMember(i))
7169         RecipeBuilder.recordRecipeOf(Member);
  }
7171 
7172   // ---------------------------------------------------------------------------
7173   // Build initial VPlan: Scan the body of the loop in a topological order to
7174   // visit each basic block after having visited its predecessor basic blocks.
7175   // ---------------------------------------------------------------------------
7176 
7177   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7178   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7179   auto Plan = std::make_unique<VPlan>(VPBB);
7180 
7181   // Represent values that will have defs inside VPlan.
7182   for (Value *V : NeedDef)
7183     Plan->addVPValue(V);
7184 
7185   // Scan the body of the loop in a topological order to visit each basic block
7186   // after having visited its predecessor basic blocks.
7187   LoopBlocksDFS DFS(OrigLoop);
7188   DFS.perform(LI);
7189 
7190   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7191     // Relevant instructions from basic block BB will be grouped into VPRecipe
7192     // ingredients and fill a new VPBasicBlock.
7193     unsigned VPBBsForBB = 0;
7194     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7195     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7196     VPBB = FirstVPBBForBB;
7197     Builder.setInsertPoint(VPBB);
7198 
7199     // Introduce each ingredient into VPlan.
7200     for (Instruction &I : BB->instructionsWithoutDebug()) {
7201       Instruction *Instr = &I;
7202 
7203       // First filter out irrelevant instructions, to ensure no recipes are
7204       // built for them.
7205       if (isa<BranchInst>(Instr) ||
7206           DeadInstructions.find(Instr) != DeadInstructions.end())
7207         continue;
7208 
7209       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7210         continue;
7211 
      // Otherwise, if all widening options failed, the instruction is to be
7213       // replicated. This may create a successor for VPBB.
7214       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7215           Instr, Range, VPBB, PredInst2Recipe, Plan);
7216       if (NextVPBB != VPBB) {
7217         VPBB = NextVPBB;
7218         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7219                                     : "");
7220       }
7221     }
7222   }
7223 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7227   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7228   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7229   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7230   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7231   delete PreEntry;
7232 
7233   // ---------------------------------------------------------------------------
7234   // Transform initial VPlan: Apply previously taken decisions, in order, to
7235   // bring the VPlan to its final state.
7236   // ---------------------------------------------------------------------------
7237 
7238   // Apply Sink-After legal constraints.
7239   for (auto &Entry : SinkAfter) {
7240     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7241     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7242     Sink->moveAfter(Target);
7243   }
7244 
7245   // Interleave memory: for each Interleave Group we marked earlier as relevant
7246   // for this VPlan, replace the Recipes widening its memory instructions with a
7247   // single VPInterleaveRecipe at its insertion point.
7248   for (auto IG : InterleaveGroups) {
7249     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7250         RecipeBuilder.getRecipe(IG->getInsertPos()));
7251     (new VPInterleaveRecipe(IG, Recipe->getMask()))->insertBefore(Recipe);
7252 
7253     for (unsigned i = 0; i < IG->getFactor(); ++i)
7254       if (Instruction *Member = IG->getMember(i)) {
7255         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7256       }
7257   }
7258 
7259   // Finally, if tail is folded by masking, introduce selects between the phi
7260   // and the live-out instruction of each reduction, at the end of the latch.
7261   if (CM.foldTailByMasking()) {
7262     Builder.setInsertPoint(VPBB);
7263     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7264     for (auto &Reduction : *Legal->getReductionVars()) {
7265       VPValue *Phi = Plan->getVPValue(Reduction.first);
7266       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7267       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7268     }
7269   }
7270 
7271   std::string PlanName;
7272   raw_string_ostream RSO(PlanName);
7273   unsigned VF = Range.Start;
7274   Plan->addVF(VF);
7275   RSO << "Initial VPlan for VF={" << VF;
7276   for (VF *= 2; VF < Range.End; VF *= 2) {
7277     Plan->addVF(VF);
7278     RSO << "," << VF;
7279   }
7280   RSO << "},UF>=1";
7281   RSO.flush();
7282   Plan->setName(PlanName);
7283 
7284   return Plan;
7285 }
7286 
7287 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
7292   assert(!OrigLoop->empty());
7293   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7294 
7295   // Create new empty VPlan
7296   auto Plan = std::make_unique<VPlan>();
7297 
7298   // Build hierarchical CFG
7299   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7300   HCFGBuilder.buildHierarchicalCFG();
7301 
7302   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7303     Plan->addVF(VF);
7304 
7305   if (EnableVPlanPredication) {
7306     VPlanPredicator VPP(*Plan);
7307     VPP.predicate();
7308 
7309     // Avoid running transformation to recipes until masked code generation in
7310     // VPlan-native path is in place.
7311     return Plan;
7312   }
7313 
7314   SmallPtrSet<Instruction *, 1> DeadInstructions;
7315   VPlanTransforms::VPInstructionsToVPRecipes(
7316       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7317   return Plan;
7318 }
7319 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7324 
7325 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7326   O << " +\n"
7327     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7328   IG->getInsertPos()->printAsOperand(O, false);
7329   if (User) {
7330     O << ", ";
7331     User->getOperand(0)->printAsOperand(O);
7332   }
7333   O << "\\l\"";
7334   for (unsigned i = 0; i < IG->getFactor(); ++i)
7335     if (Instruction *I = IG->getMember(i))
7336       O << " +\n"
7337         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7338 }
7339 
7340 void VPWidenRecipe::execute(VPTransformState &State) {
7341   for (auto &Instr : make_range(Begin, End))
7342     State.ILV->widenInstruction(Instr);
7343 }
7344 
7345 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7346   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7347                       IsIndexLoopInvariant);
7348 }
7349 
7350 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7351   assert(!State.Instance && "Int or FP induction being replicated.");
7352   State.ILV->widenIntOrFpInduction(IV, Trunc);
7353 }
7354 
7355 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7356   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7357 }
7358 
7359 void VPBlendRecipe::execute(VPTransformState &State) {
7360   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7361   // We know that all PHIs in non-header blocks are converted into
7362   // selects, so we don't have to worry about the insertion order and we
7363   // can just use the builder.
7364   // At this point we generate the predication tree. There may be
7365   // duplications since this is a simple recursive scan, but future
7366   // optimizations will clean it up.
7367 
7368   unsigned NumIncoming = Phi->getNumIncomingValues();
7369 
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7372   // Generate a sequence of selects of the form:
7373   // SELECT(Mask3, In3,
7374   //      SELECT(Mask2, In2,
7375   //                   ( ...)))
7376   InnerLoopVectorizer::VectorParts Entry(State.UF);
7377   for (unsigned In = 0; In < NumIncoming; ++In) {
7378     for (unsigned Part = 0; Part < State.UF; ++Part) {
7379       // We might have single edge PHIs (blocks) - use an identity
7380       // 'select' for the first PHI operand.
7381       Value *In0 =
7382           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7383       if (In == 0)
7384         Entry[Part] = In0; // Initialize with the first incoming value.
7385       else {
7386         // Select between the current value and the previous incoming edge
7387         // based on the incoming mask.
7388         Value *Cond = State.get(User->getOperand(In), Part);
7389         Entry[Part] =
7390             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7391       }
7392     }
7393   }
7394   for (unsigned Part = 0; Part < State.UF; ++Part)
7395     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7396 }
7397 
7398 void VPInterleaveRecipe::execute(VPTransformState &State) {
7399   assert(!State.Instance && "Interleave group being replicated.");
7400   if (!User)
7401     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7402 
7403   // Last (and currently only) operand is a mask.
7404   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7405   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7406   for (unsigned Part = 0; Part < State.UF; ++Part)
7407     MaskValues[Part] = State.get(Mask, Part);
7408   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7409 }
7410 
7411 void VPReplicateRecipe::execute(VPTransformState &State) {
7412   if (State.Instance) { // Generate a single instance.
7413     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7414     // Insert scalar instance packing it into a vector.
7415     if (AlsoPack && State.VF > 1) {
7416       // If we're constructing lane 0, initialize to start from undef.
7417       if (State.Instance->Lane == 0) {
7418         Value *Undef =
7419             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7420         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7421       }
7422       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7423     }
7424     return;
7425   }
7426 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
7430   unsigned EndLane = IsUniform ? 1 : State.VF;
7431   for (unsigned Part = 0; Part < State.UF; ++Part)
7432     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7433       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7434 }
7435 
7436 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7437   assert(State.Instance && "Branch on Mask works only on single instance.");
7438 
7439   unsigned Part = State.Instance->Part;
7440   unsigned Lane = State.Instance->Lane;
7441 
7442   Value *ConditionBit = nullptr;
7443   if (!User) // Block in mask is all-one.
7444     ConditionBit = State.Builder.getTrue();
7445   else {
7446     VPValue *BlockInMask = User->getOperand(0);
7447     ConditionBit = State.get(BlockInMask, Part);
7448     if (ConditionBit->getType()->isVectorTy())
7449       ConditionBit = State.Builder.CreateExtractElement(
7450           ConditionBit, State.Builder.getInt32(Lane));
7451   }
7452 
7453   // Replace the temporary unreachable terminator with a new conditional branch,
7454   // whose two destinations will be set later when they are created.
7455   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7456   assert(isa<UnreachableInst>(CurrentTerminator) &&
7457          "Expected to replace unreachable terminator with conditional branch.");
7458   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7459   CondBr->setSuccessor(0, nullptr);
7460   ReplaceInstWithInst(CurrentTerminator, CondBr);
7461 }
7462 
7463 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7464   assert(State.Instance && "Predicated instruction PHI works per instance.");
7465   Instruction *ScalarPredInst = cast<Instruction>(
7466       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7467   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7468   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7469   assert(PredicatingBB && "Predicated block has no single predecessor.");
7470 
7471   // By current pack/unpack logic we need to generate only a single phi node: if
7472   // a vector value for the predicated instruction exists at this point it means
7473   // the instruction has vector users only, and a phi for the vector value is
7474   // needed. In this case the recipe of the predicated instruction is marked to
7475   // also do that packing, thereby "hoisting" the insert-element sequence.
7476   // Otherwise, a phi node for the scalar value is needed.
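  //
  // Roughly, for the scalar case the generated IR looks like (block names are
  // illustrative):
  //   PredicatingBB:
  //     br i1 %mask.lane, label %PredicatedBB, label %ContinueBB
  //   PredicatedBB:
  //     %s = <the scalarized, predicated instruction>
  //     br label %ContinueBB
  //   ContinueBB:
  //     %phi = phi [ undef, %PredicatingBB ], [ %s, %PredicatedBB ]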
7477   unsigned Part = State.Instance->Part;
7478   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7479     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7480     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7481     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7482     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7483     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7484     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7485   } else {
7486     Type *PredInstType = PredInst->getType();
7487     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7488     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7489     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7490     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7491   }
7492 }
7493 
7494 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7495   VPValue *Mask = getMask();
7496   if (!Mask)
7497     return State.ILV->vectorizeMemoryInstruction(&Instr);
7498 
7499   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7500   for (unsigned Part = 0; Part < State.UF; ++Part)
7501     MaskValues[Part] = State.get(Mask, Part);
7502   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7503 }
7504 
7505 // Determine how to lower the scalar epilogue, which depends on 1) optimising
7506 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7507 // predication, and 4) a TTI hook that analyses whether the loop is suitable
7508 // for predication.
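//
// For reference: predication can be requested on the command line (the
// PreferPredicateOverEpilog option), with a loop hint (Hints.getPredicate()),
// or by the target via the TTI->preferPredicateOverEpilogue() hook; the
// checks below combine these sources.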
7509 static ScalarEpilogueLowering getScalarEpilogueLowering(
7510     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7511     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7512     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7513     LoopVectorizationLegality &LVL) {
7514   bool OptSize =
7515       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7516                                                      PGSOQueryType::IRPass);
7517   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7518   // don't look at hints or options, and don't request a scalar epilogue.
7519   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7520     return CM_ScalarEpilogueNotAllowedOptSize;
7521 
7522   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7523                               !PreferPredicateOverEpilog;
7524 
7525   // 2) Next, if disabling predication is requested on the command line, honour
7526   // this and request a scalar epilogue. Also do this if we don't have a
7527   // primary induction variable, which is required for predication.
7528   if (PredicateOptDisabled || !LVL.getPrimaryInduction())
7529     return CM_ScalarEpilogueAllowed;
7530 
  // 3) and 4) Check if predication is requested on the command line or with a
  // loop hint, or if the TTI hook indicates that it is profitable; if so,
  // request predication.
7534   if (PreferPredicateOverEpilog ||
7535       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7536       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7537                                         LVL.getLAI()) &&
7538        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7539     return CM_ScalarEpilogueNotNeededUsePredicate;
7540 
7541   return CM_ScalarEpilogueAllowed;
7542 }
7543 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which makes it possible to
// apply VPlan-to-VPlan transformations from the very beginning without
// modifying the input LLVM IR.
7548 static bool processLoopInVPlanNativePath(
7549     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7550     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7551     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7552     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7553     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7554 
7555   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7556   Function *F = L->getHeader()->getParent();
7557   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7558 
7559   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7560       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7561 
7562   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7563                                 &Hints, IAI);
7564   // Use the planner for outer loop vectorization.
7565   // TODO: CM is not used at this point inside the planner. Turn CM into an
7566   // optional argument if we don't need it in the future.
7567   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7568 
7569   // Get user vectorization factor.
7570   const unsigned UserVF = Hints.getWidth();
7571 
7572   // Plan how to best vectorize, return the best VF and its cost.
7573   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7574 
7575   // If we are stress testing VPlan builds, do not attempt to generate vector
7576   // code. Masked vector code generation support will follow soon.
7577   // Also, do not attempt to vectorize if no vector code will be produced.
7578   if (VPlanBuildStressTest || EnableVPlanPredication ||
7579       VectorizationFactor::Disabled() == VF)
7580     return false;
7581 
7582   LVP.setBestPlan(VF.Width, 1);
7583 
7584   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7585                          &CM);
7586   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7587                     << L->getHeader()->getParent()->getName() << "\"\n");
7588   LVP.executePlan(LB, DT);
7589 
7590   // Mark the loop as already vectorized to avoid vectorizing again.
7591   Hints.setAlreadyVectorized();
7592 
7593   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7594   return true;
7595 }
7596 
7597 bool LoopVectorizePass::processLoop(Loop *L) {
7598   assert((EnableVPlanNativePath || L->empty()) &&
7599          "VPlan-native path is not enabled. Only process inner loops.");
7600 
7601 #ifndef NDEBUG
7602   const std::string DebugLocStr = getDebugLocString(L);
7603 #endif /* NDEBUG */
7604 
7605   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7606                     << L->getHeader()->getParent()->getName() << "\" from "
7607                     << DebugLocStr << "\n");
7608 
7609   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7610 
7611   LLVM_DEBUG(
7612       dbgs() << "LV: Loop hints:"
7613              << " force="
7614              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7615                      ? "disabled"
7616                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7617                             ? "enabled"
7618                             : "?"))
7619              << " width=" << Hints.getWidth()
7620              << " unroll=" << Hints.getInterleave() << "\n");
7621 
7622   // Function containing loop
7623   Function *F = L->getHeader()->getParent();
7624 
  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it is
  // important to generate an optimization remark for each loop. Most of these
  // messages are generated as OptimizationRemarkAnalysis. Remarks generated
  // as OptimizationRemark and OptimizationRemarkMissed are less verbose; they
  // report vectorized loops and unvectorized loops that may benefit from
  // vectorization, respectively.
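  // (In clang, these remarks can be surfaced with -Rpass=loop-vectorize,
  // -Rpass-missed=loop-vectorize and -Rpass-analysis=loop-vectorize; opt
  // exposes the corresponding -pass-remarks options.)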
7632 
7633   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7634     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7635     return false;
7636   }
7637 
7638   PredicatedScalarEvolution PSE(*SE, *L);
7639 
7640   // Check if it is legal to vectorize the loop.
7641   LoopVectorizationRequirements Requirements(*ORE);
7642   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7643                                 &Requirements, &Hints, DB, AC);
7644   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7645     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7646     Hints.emitRemarkWithHints();
7647     return false;
7648   }
7649 
7650   // Check the function attributes and profiles to find out if this function
7651   // should be optimized for size.
7652   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7653       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7654 
7655   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7656   // here. They may require CFG and instruction level transformations before
7657   // even evaluating whether vectorization is profitable. Since we cannot modify
7658   // the incoming IR, we need to build VPlan upfront in the vectorization
7659   // pipeline.
7660   if (!L->empty())
7661     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7662                                         ORE, BFI, PSI, Hints);
7663 
7664   assert(L->empty() && "Inner loop expected.");
7665 
7666   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7667   // count by optimizing for size, to minimize overheads.
7668   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7669   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7670     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7671                       << "This loop is worth vectorizing only if no scalar "
7672                       << "iteration overheads are incurred.");
7673     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7674       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7675     else {
7676       LLVM_DEBUG(dbgs() << "\n");
7677       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7678     }
7679   }
7680 
7681   // Check the function attributes to see if implicit floats are allowed.
7682   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7683   // an integer loop and the vector instructions selected are purely integer
7684   // vector instructions?
7685   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7686     reportVectorizationFailure(
7687         "Can't vectorize when the NoImplicitFloat attribute is used",
7688         "loop not vectorized due to NoImplicitFloat attribute",
7689         "NoImplicitFloat", ORE, L);
7690     Hints.emitRemarkWithHints();
7691     return false;
7692   }
7693 
7694   // Check if the target supports potentially unsafe FP vectorization.
7695   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7696   // for the target we're vectorizing for, to make sure none of the
7697   // additional fp-math flags can help.
7698   if (Hints.isPotentiallyUnsafe() &&
7699       TTI->isFPVectorizationPotentiallyUnsafe()) {
7700     reportVectorizationFailure(
7701         "Potentially unsafe FP op prevents vectorization",
7702         "loop not vectorized due to unsafe FP support.",
7703         "UnsafeFP", ORE, L);
7704     Hints.emitRemarkWithHints();
7705     return false;
7706   }
7707 
7708   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7709   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7710 
7711   // If an override option has been passed in for interleaved accesses, use it.
7712   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7713     UseInterleaved = EnableInterleavedMemAccesses;
7714 
7715   // Analyze interleaved memory accesses.
7716   if (UseInterleaved) {
7717     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7718   }
7719 
7720   // Use the cost model.
7721   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7722                                 F, &Hints, IAI);
7723   CM.collectValuesToIgnore();
7724 
7725   // Use the planner for vectorization.
7726   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7727 
7728   // Get user vectorization factor.
7729   unsigned UserVF = Hints.getWidth();
7730 
7731   // Plan how to best vectorize, return the best VF and its cost.
7732   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7733 
7734   VectorizationFactor VF = VectorizationFactor::Disabled();
7735   unsigned IC = 1;
7736   unsigned UserIC = Hints.getInterleave();
7737 
7738   if (MaybeVF) {
7739     VF = *MaybeVF;
7740     // Select the interleave count.
7741     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7742   }
7743 
7744   // Identify the diagnostic messages that should be produced.
7745   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7746   bool VectorizeLoop = true, InterleaveLoop = true;
7747   if (Requirements.doesNotMeet(F, L, Hints)) {
7748     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7749                          "requirements.\n");
7750     Hints.emitRemarkWithHints();
7751     return false;
7752   }
7753 
7754   if (VF.Width == 1) {
7755     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7756     VecDiagMsg = std::make_pair(
7757         "VectorizationNotBeneficial",
7758         "the cost-model indicates that vectorization is not beneficial");
7759     VectorizeLoop = false;
7760   }
7761 
7762   if (!MaybeVF && UserIC > 1) {
7763     // Tell the user interleaving was avoided up-front, despite being explicitly
7764     // requested.
7765     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7766                          "interleaving should be avoided up front\n");
7767     IntDiagMsg = std::make_pair(
7768         "InterleavingAvoided",
7769         "Ignoring UserIC, because interleaving was avoided up front");
7770     InterleaveLoop = false;
7771   } else if (IC == 1 && UserIC <= 1) {
7772     // Tell the user interleaving is not beneficial.
7773     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7774     IntDiagMsg = std::make_pair(
7775         "InterleavingNotBeneficial",
7776         "the cost-model indicates that interleaving is not beneficial");
7777     InterleaveLoop = false;
7778     if (UserIC == 1) {
7779       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7780       IntDiagMsg.second +=
7781           " and is explicitly disabled or interleave count is set to 1";
7782     }
7783   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
7785     LLVM_DEBUG(
7786         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7787     IntDiagMsg = std::make_pair(
7788         "InterleavingBeneficialButDisabled",
7789         "the cost-model indicates that interleaving is beneficial "
7790         "but is explicitly disabled or interleave count is set to 1");
7791     InterleaveLoop = false;
7792   }
7793 
7794   // Override IC if user provided an interleave count.
7795   IC = UserIC > 0 ? UserIC : IC;
7796 
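  // At this point VectorizeLoop and InterleaveLoop describe one of four
  // outcomes handled below: neither (emit missed remarks and bail out),
  // interleave only, vectorize only, or vectorize and interleave.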
7797   // Emit diagnostic messages, if any.
7798   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7799   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7801     ORE->emit([&]() {
7802       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7803                                       L->getStartLoc(), L->getHeader())
7804              << VecDiagMsg.second;
7805     });
7806     ORE->emit([&]() {
7807       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7808                                       L->getStartLoc(), L->getHeader())
7809              << IntDiagMsg.second;
7810     });
7811     return false;
7812   } else if (!VectorizeLoop && InterleaveLoop) {
7813     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7814     ORE->emit([&]() {
7815       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7816                                         L->getStartLoc(), L->getHeader())
7817              << VecDiagMsg.second;
7818     });
7819   } else if (VectorizeLoop && !InterleaveLoop) {
7820     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7821                       << ") in " << DebugLocStr << '\n');
7822     ORE->emit([&]() {
7823       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7824                                         L->getStartLoc(), L->getHeader())
7825              << IntDiagMsg.second;
7826     });
7827   } else if (VectorizeLoop && InterleaveLoop) {
7828     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7829                       << ") in " << DebugLocStr << '\n');
7830     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7831   }
7832 
7833   LVP.setBestPlan(VF.Width, IC);
7834 
7835   using namespace ore;
7836   bool DisableRuntimeUnroll = false;
7837   MDNode *OrigLoopID = L->getLoopID();
7838 
7839   if (!VectorizeLoop) {
7840     assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not profitable to vectorize the loop, then
    // interleave it.
7843     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7844                                &CM);
7845     LVP.executePlan(Unroller, DT);
7846 
7847     ORE->emit([&]() {
7848       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7849                                 L->getHeader())
7850              << "interleaved loop (interleaved count: "
7851              << NV("InterleaveCount", IC) << ")";
7852     });
7853   } else {
    // If we decided that it is profitable to vectorize the loop, then do it.
7855     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7856                            &LVL, &CM);
7857     LVP.executePlan(LB, DT);
7858     ++LoopsVectorized;
7859 
7860     // Add metadata to disable runtime unrolling a scalar loop when there are
7861     // no runtime checks about strides and memory. A scalar loop that is
7862     // rarely used is not worth unrolling.
7863     if (!LB.areSafetyChecksAdded())
7864       DisableRuntimeUnroll = true;
7865 
7866     // Report the vectorization decision.
7867     ORE->emit([&]() {
7868       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7869                                 L->getHeader())
7870              << "vectorized loop (vectorization width: "
7871              << NV("VectorizationFactor", VF.Width)
7872              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7873     });
7874   }
7875 
7876   Optional<MDNode *> RemainderLoopID =
7877       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7878                                       LLVMLoopVectorizeFollowupEpilogue});
7879   if (RemainderLoopID.hasValue()) {
7880     L->setLoopID(RemainderLoopID.getValue());
7881   } else {
7882     if (DisableRuntimeUnroll)
7883       AddRuntimeUnrollDisableMetaData(L);
7884 
7885     // Mark the loop as already vectorized to avoid vectorizing again.
7886     Hints.setAlreadyVectorized();
7887   }
7888 
7889   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7890   return true;
7891 }
7892 
7893 bool LoopVectorizePass::runImpl(
7894     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7895     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7896     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7897     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7898     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7899   SE = &SE_;
7900   LI = &LI_;
7901   TTI = &TTI_;
7902   DT = &DT_;
7903   BFI = &BFI_;
7904   TLI = TLI_;
7905   AA = &AA_;
7906   AC = &AC_;
7907   GetLAA = &GetLAA_;
7908   DB = &DB_;
7909   ORE = &ORE_;
7910   PSI = PSI_;
7911 
7912   // Don't attempt if
7913   // 1. the target claims to have no vector registers, and
7914   // 2. interleaving won't help ILP.
7915   //
7916   // The second condition is necessary because, even if the target has no
7917   // vector registers, loop vectorization may still enable scalar
7918   // interleaving.
7919   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7920       TTI->getMaxInterleaveFactor(1) < 2)
7921     return false;
7922 
7923   bool Changed = false;
7924 
7925   // The vectorizer requires loops to be in simplified form.
7926   // Since simplification may add new inner loops, it has to run before the
7927   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7929   // vectorized.
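  // (Simplified form means each loop has a preheader, a single backedge and
  // dedicated exit blocks.)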
7930   for (auto &L : *LI)
7931     Changed |=
7932         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7933 
7934   // Build up a worklist of inner-loops to vectorize. This is necessary as
7935   // the act of vectorizing or partially unrolling a loop creates new loops
7936   // and can invalidate iterators across the loops.
7937   SmallVector<Loop *, 8> Worklist;
7938 
7939   for (Loop *L : *LI)
7940     collectSupportedLoops(*L, LI, ORE, Worklist);
7941 
7942   LoopsAnalyzed += Worklist.size();
7943 
7944   // Now walk the identified inner loops.
7945   while (!Worklist.empty()) {
7946     Loop *L = Worklist.pop_back_val();
7947 
7948     // For the inner loops we actually process, form LCSSA to simplify the
7949     // transform.
7950     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7951 
7952     Changed |= processLoop(L);
7953   }
7954 
7955   // Process each loop nest in the function.
7956   return Changed;
7957 }
7958 
7959 PreservedAnalyses LoopVectorizePass::run(Function &F,
7960                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
8002 }
8003