1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
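// For example (an illustrative sketch, not the exact IR produced), a scalar
// loop such as
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
// is conceptually rewritten with a vectorization factor (VF) of 4 so that each
// wide iteration loads B[i..i+3], adds 42 to all four lanes, stores A[i..i+3],
// and then increments i by 4.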
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Indicates that an epilogue is undesired and that predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
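// For example (illustrative only), with VF = 4 and a trip count of 10, folding
// the tail executes 3 masked vector iterations (the last one with only 2
// active lanes) instead of 2 vector iterations followed by a 2-iteration
// scalar epilogue.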
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting the vectorization factor, "
192              "which will be determined by the smallest type in the loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
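/// For example, a group that accesses only members 0 and 2 of a 3-member
/// interleaved layout has a gap at member 1; the wide load covering the group
/// can then be masked so that the gap elements are not accessed.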
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with an estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if-predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
281 // VPlan-native vectorization path. It must be used in conjunction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function that returns the type of loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
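/// For example (assuming a typical data layout), i1 is 1 bit wide but has an
/// allocation size of 1 byte, so an array of i1 is not "bitcast compatible"
/// with a <VF x i1> vector and the type is treated as irregular.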
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = VectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
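/// For example, with the current return value of 2, the cost model assumes a
/// predicated block executes once every two loop iterations, so its
/// scalarization cost is roughly halved when accumulated into the loop cost.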
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops that contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors or multiple
379 /// scalar copies. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
388 /// LoopVectorizationLegality class to provide information about the induction
389 /// and reduction variables that were found for a given vectorization factor.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM)
399       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
400         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
401         Builder(PSE.getSE()->getContext()),
402         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
403   virtual ~InnerLoopVectorizer() = default;
404 
405   /// Create a new empty loop. Unlink the old loop and connect the new one.
406   /// Return the pre-header block of the new loop.
407   BasicBlock *createVectorizedLoopSkeleton();
408 
409   /// Widen a single instruction within the innermost loop.
410   void widenInstruction(Instruction &I);
411 
412   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
413   void fixVectorizedLoop();
414 
415   // Return true if any runtime check is added.
416   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
417 
418   /// A type for vectorized values in the new loop. Each value from the
419   /// original loop, when vectorized, is represented by UF vector values in the
420   /// new unrolled loop, where UF is the unroll factor.
421   using VectorParts = SmallVector<Value *, 2>;
422 
423   /// Vectorize a single GetElementPtrInst based on information gathered and
424   /// decisions taken during planning.
425   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
426                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
427 
428   /// Vectorize a single PHINode in a block. This method handles the induction
429   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
430   /// arbitrary length vectors.
431   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
432 
433   /// A helper function to scalarize a single Instruction in the innermost loop.
434   /// Generates a single scalar instance of the instruction for the vector lane
435   /// and unroll part given by \p Instance, predicating it if
436   /// \p IfPredicateInstr is set.
437   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
438                             bool IfPredicateInstr);
439 
440   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
441   /// is provided, the integer induction variable will first be truncated to
442   /// the corresponding type.
443   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
444 
445   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
446   /// vector or scalar value on-demand if one is not yet available. When
447   /// vectorizing a loop, we visit the definition of an instruction before its
448   /// uses. When visiting the definition, we either vectorize or scalarize the
449   /// instruction, creating an entry for it in the corresponding map. (In some
450   /// cases, such as induction variables, we will create both vector and scalar
451   /// entries.) Then, as we encounter uses of the definition, we derive values
452   /// for each scalar or vector use unless such a value is already available.
453   /// For example, if we scalarize a definition and one of its uses is vector,
454   /// we build the required vector on-demand with an insertelement sequence
455   /// when visiting the use. Otherwise, if the use is scalar, we can use the
456   /// existing scalar definition.
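  /// For example (illustrative IR, assuming VF = 4 and scalar values %s0..%s3
  /// for one unroll part), the on-demand vector is built roughly as:
  ///   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  ///   %v1 = insertelement <4 x i32> %v0, i32 %s1, i32 1
  ///   %v2 = insertelement <4 x i32> %v1, i32 %s2, i32 2
  ///   %v3 = insertelement <4 x i32> %v2, i32 %s3, i32 3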
457   ///
458   /// Return a value in the new loop corresponding to \p V from the original
459   /// loop at unroll index \p Part. If the value has already been vectorized,
460   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
461   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
462   /// a new vector value on-demand by inserting the scalar values into a vector
463   /// with an insertelement sequence. If the value has been neither vectorized
464   /// nor scalarized, it must be loop invariant, so we simply broadcast the
465   /// value into a vector.
466   Value *getOrCreateVectorValue(Value *V, unsigned Part);
467 
468   /// Return a value in the new loop corresponding to \p V from the original
469   /// loop at unroll and vector indices \p Instance. If the value has been
470   /// vectorized but not scalarized, the necessary extractelement instruction
471   /// will be generated.
472   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
473 
474   /// Construct the vector value of a scalarized value \p V one lane at a time.
475   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
476 
477   /// Try to vectorize the interleaved access group that \p Instr belongs to
478   /// with the base address given in \p Addr, optionally masking the vector
479   /// operations if \p BlockInMask is non-null. Use \p State to translate given
480   /// VPValues to IR values in the vectorized loop.
481   void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
482                                 VPValue *Addr, VPValue *BlockInMask = nullptr);
483 
484   /// Vectorize Load and Store instructions with the base address given in \p
485   /// Addr, optionally masking the vector operations if \p BlockInMask is
486   /// non-null. Use \p State to translate given VPValues to IR values in the
487   /// vectorized loop.
488   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
489                                   VPValue *Addr, VPValue *StoredValue,
490                                   VPValue *BlockInMask);
491 
492   /// Set the debug location in the builder using the debug location in
493   /// the instruction.
494   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
495 
496   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
497   void fixNonInductionPHIs(void);
498 
499 protected:
500   friend class LoopVectorizationPlanner;
501 
502   /// A small list of PHINodes.
503   using PhiVector = SmallVector<PHINode *, 4>;
504 
505   /// A type for scalarized values in the new loop. Each value from the
506   /// original loop, when scalarized, is represented by UF x VF scalar values
507   /// in the new unrolled loop, where UF is the unroll factor and VF is the
508   /// vectorization factor.
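  /// For example, with UF = 2 and VF = 4, a scalarized value is held as two
  /// groups (one per unroll part) of four scalar values (one per vector lane).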
509   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
510 
511   /// Set up the values of the IVs correctly when exiting the vector loop.
512   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
513                     Value *CountRoundDown, Value *EndValue,
514                     BasicBlock *MiddleBlock);
515 
516   /// Create a new induction variable inside L.
517   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
518                                    Value *Step, Instruction *DL);
519 
520   /// Handle all cross-iteration phis in the header.
521   void fixCrossIterationPHIs();
522 
523   /// Fix a first-order recurrence. This is the second phase of vectorizing
524   /// this phi node.
525   void fixFirstOrderRecurrence(PHINode *Phi);
526 
527   /// Fix a reduction cross-iteration phi. This is the second phase of
528   /// vectorizing this phi node.
529   void fixReduction(PHINode *Phi);
530 
531   /// Clear NSW/NUW flags from reduction instructions if necessary.
532   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
533 
534   /// The loop exit block may have single-value PHI nodes with some
535   /// incoming value. While vectorizing, we only handled real values
536   /// that were defined inside the loop, and we should have one value for
537   /// each predecessor of its parent basic block. See PR14725.
538   void fixLCSSAPHIs();
539 
540   /// Iteratively sink the scalarized operands of a predicated instruction into
541   /// the block that was created for it.
542   void sinkScalarOperands(Instruction *PredInst);
543 
544   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
545   /// represented as.
546   void truncateToMinimalBitwidths();
547 
548   /// Create a broadcast instruction. This method generates a broadcast
549   /// instruction (shuffle) for loop invariant values and for the induction
550   /// value. If this is the induction variable then we extend it to N, N+1, ...
551   /// this is needed because each iteration in the loop corresponds to a SIMD
552   /// element.
553   virtual Value *getBroadcastInstrs(Value *V);
554 
555   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
556   /// to each vector element of Val. The sequence starts at StartIndex.
557   /// \p Opcode is relevant for FP induction variable.
558   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
559                                Instruction::BinaryOps Opcode =
560                                Instruction::BinaryOpsEnd);
561 
562   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
563   /// variable on which to base the steps, \p Step is the size of the step, and
564   /// \p EntryVal is the value from the original loop that maps to the steps.
565   /// Note that \p EntryVal doesn't have to be an induction variable - it
566   /// can also be a truncate instruction.
567   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
568                         const InductionDescriptor &ID);
569 
570   /// Create a vector induction phi node based on an existing scalar one. \p
571   /// EntryVal is the value from the original loop that maps to the vector phi
572   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
573   /// truncate instruction, instead of widening the original IV, we widen a
574   /// version of the IV truncated to \p EntryVal's type.
575   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
576                                        Value *Step, Instruction *EntryVal);
577 
578   /// Returns true if an instruction \p I should be scalarized instead of
579   /// vectorized for the chosen vectorization factor.
580   bool shouldScalarizeInstruction(Instruction *I) const;
581 
582   /// Returns true if we should generate a scalar version of \p IV.
583   bool needsScalarInduction(Instruction *IV) const;
584 
585   /// If there is a cast involved in the induction variable \p ID, which should
586   /// be ignored in the vectorized loop body, this function records the
587   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
588   /// cast. We had already proved that the casted Phi is equal to the uncasted
589   /// Phi in the vectorized loop (under a runtime guard), and therefore
590   /// there is no need to vectorize the cast - the same value can be used in the
591   /// vector loop for both the Phi and the cast.
592   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
593   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
594   ///
595   /// \p EntryVal is the value from the original loop that maps to the vector
596   /// phi node and is used to distinguish what is the IV currently being
597   /// processed - original one (if \p EntryVal is a phi corresponding to the
598   /// original IV) or the "newly-created" one based on the proof mentioned above
599   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
600   /// latter case \p EntryVal is a TruncInst and we must not record anything for
601   /// that IV, but it's error-prone to expect callers of this routine to care
602   /// about that, hence this explicit parameter.
603   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
604                                              const Instruction *EntryVal,
605                                              Value *VectorLoopValue,
606                                              unsigned Part,
607                                              unsigned Lane = UINT_MAX);
608 
609   /// Generate a shuffle sequence that will reverse the vector Vec.
610   virtual Value *reverseVector(Value *Vec);
611 
612   /// Returns (and creates if needed) the original loop trip count.
613   Value *getOrCreateTripCount(Loop *NewLoop);
614 
615   /// Returns (and creates if needed) the trip count of the widened loop.
616   Value *getOrCreateVectorTripCount(Loop *NewLoop);
617 
618   /// Returns a bitcasted value to the requested vector type.
619   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
620   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
621                                 const DataLayout &DL);
622 
623   /// Emit a bypass check to see if the vector trip count is zero, including if
624   /// it overflows.
625   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
626 
627   /// Emit a bypass check to see if all of the SCEV assumptions we've
628   /// had to make are correct.
629   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
630 
631   /// Emit bypass checks to check any memory assumptions we may have made.
632   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
633 
634   /// Compute the transformed value of Index at offset StartValue using step
635   /// StepValue.
636   /// For integer induction, returns StartValue + Index * StepValue.
637   /// For pointer induction, returns StartValue[Index * StepValue].
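  /// For example, for an integer induction with StartValue 7 and StepValue 3,
  /// an Index of 4 is transformed to 7 + 4 * 3 = 19.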
638   /// FIXME: The newly created binary instructions should contain nsw/nuw
639   /// flags, which can be found from the original scalar operations.
640   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
641                               const DataLayout &DL,
642                               const InductionDescriptor &ID) const;
643 
644   /// Add additional metadata to \p To that was not present on \p Orig.
645   ///
646   /// Currently this is used to add the noalias annotations based on the
647   /// inserted memchecks.  Use this for instructions that are *cloned* into the
648   /// vector loop.
649   void addNewMetadata(Instruction *To, const Instruction *Orig);
650 
651   /// Add metadata from one instruction to another.
652   ///
653   /// This includes both the original MDs from \p From and additional ones (\see
654   /// addNewMetadata).  Use this for *newly created* instructions in the vector
655   /// loop.
656   void addMetadata(Instruction *To, Instruction *From);
657 
658   /// Similar to the previous function but it adds the metadata to a
659   /// vector of instructions.
660   void addMetadata(ArrayRef<Value *> To, Instruction *From);
661 
662   /// The original loop.
663   Loop *OrigLoop;
664 
665   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
666   /// dynamic knowledge to simplify SCEV expressions and converts them to a
667   /// more usable form.
668   PredicatedScalarEvolution &PSE;
669 
670   /// Loop Info.
671   LoopInfo *LI;
672 
673   /// Dominator Tree.
674   DominatorTree *DT;
675 
676   /// Alias Analysis.
677   AliasAnalysis *AA;
678 
679   /// Target Library Info.
680   const TargetLibraryInfo *TLI;
681 
682   /// Target Transform Info.
683   const TargetTransformInfo *TTI;
684 
685   /// Assumption Cache.
686   AssumptionCache *AC;
687 
688   /// Interface to emit optimization remarks.
689   OptimizationRemarkEmitter *ORE;
690 
691   /// LoopVersioning.  It's only set up (non-null) if memchecks were
692   /// used.
693   ///
694   /// This is currently only used to add no-alias metadata based on the
695   /// memchecks.  The actual versioning is performed manually.
696   std::unique_ptr<LoopVersioning> LVer;
697 
698   /// The vectorization SIMD factor to use. Each vector will have this many
699   /// vector elements.
700   unsigned VF;
701 
702   /// The vectorization unroll factor to use. Each scalar is vectorized to this
703   /// many different vector instructions.
704   unsigned UF;
705 
706   /// The builder that we use.
707   IRBuilder<> Builder;
708 
709   // --- Vectorization state ---
710 
711   /// The vector-loop preheader.
712   BasicBlock *LoopVectorPreHeader;
713 
714   /// The scalar-loop preheader.
715   BasicBlock *LoopScalarPreHeader;
716 
717   /// Middle Block between the vector and the scalar.
718   BasicBlock *LoopMiddleBlock;
719 
720   /// The ExitBlock of the scalar loop.
721   BasicBlock *LoopExitBlock;
722 
723   /// The vector loop body.
724   BasicBlock *LoopVectorBody;
725 
726   /// The scalar loop body.
727   BasicBlock *LoopScalarBody;
728 
729   /// A list of all bypass blocks. The first block is the entry of the loop.
730   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
731 
732   /// The new Induction variable which was added to the new block.
733   PHINode *Induction = nullptr;
734 
735   /// The induction variable of the old basic block.
736   PHINode *OldInduction = nullptr;
737 
738   /// Maps values from the original loop to their corresponding values in the
739   /// vectorized loop. A key value can map to either vector values, scalar
740   /// values or both kinds of values, depending on whether the key was
741   /// vectorized and scalarized.
742   VectorizerValueMap VectorLoopValueMap;
743 
744   /// Store instructions that were predicated.
745   SmallVector<Instruction *, 4> PredicatedInstructions;
746 
747   /// Trip count of the original loop.
748   Value *TripCount = nullptr;
749 
750   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
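  /// For example, with TripCount = 37, VF = 8 and UF = 2, the widened loop
  /// executes 32 (= 37 - 37 % 16) iterations of the original loop.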
751   Value *VectorTripCount = nullptr;
752 
753   /// The legality analysis.
754   LoopVectorizationLegality *Legal;
755 
756   /// The profitability analysis.
757   LoopVectorizationCostModel *Cost;
758 
759   // Record whether runtime checks are added.
760   bool AddedSafetyChecks = false;
761 
762   // Holds the end values for each induction variable. We save the end values
763   // so we can later fix-up the external users of the induction variables.
764   DenseMap<PHINode *, Value *> IVEndValues;
765 
766   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
767   // fixed up at the end of vector code generation.
768   SmallVector<PHINode *, 8> OrigPHIsToFix;
769 };
770 
771 class InnerLoopUnroller : public InnerLoopVectorizer {
772 public:
773   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
774                     LoopInfo *LI, DominatorTree *DT,
775                     const TargetLibraryInfo *TLI,
776                     const TargetTransformInfo *TTI, AssumptionCache *AC,
777                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
778                     LoopVectorizationLegality *LVL,
779                     LoopVectorizationCostModel *CM)
780       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
781                             UnrollFactor, LVL, CM) {}
782 
783 private:
784   Value *getBroadcastInstrs(Value *V) override;
785   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
786                        Instruction::BinaryOps Opcode =
787                        Instruction::BinaryOpsEnd) override;
788   Value *reverseVector(Value *Vec) override;
789 };
790 
791 } // end namespace llvm
792 
793 /// Look for a meaningful debug location on the instruction or its
794 /// operands.
795 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
796   if (!I)
797     return I;
798 
799   DebugLoc Empty;
800   if (I->getDebugLoc() != Empty)
801     return I;
802 
803   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
804     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
805       if (OpInst->getDebugLoc() != Empty)
806         return OpInst;
807   }
808 
809   return I;
810 }
811 
812 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
813   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
814     const DILocation *DIL = Inst->getDebugLoc();
815     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
816         !isa<DbgInfoIntrinsic>(Inst)) {
817       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
818       if (NewDIL)
819         B.SetCurrentDebugLocation(NewDIL.getValue());
820       else
821         LLVM_DEBUG(dbgs()
822                    << "Failed to create new discriminator: "
823                    << DIL->getFilename() << " Line: " << DIL->getLine());
824     }
825     else
826       B.SetCurrentDebugLocation(DIL);
827   } else
828     B.SetCurrentDebugLocation(DebugLoc());
829 }
830 
831 /// Write a record \p DebugMsg about vectorization failure to the debug
832 /// output stream. If \p I is passed, it is an instruction that prevents
833 /// vectorization.
834 #ifndef NDEBUG
835 static void debugVectorizationFailure(const StringRef DebugMsg,
836     Instruction *I) {
837   dbgs() << "LV: Not vectorizing: " << DebugMsg;
838   if (I != nullptr)
839     dbgs() << " " << *I;
840   else
841     dbgs() << '.';
842   dbgs() << '\n';
843 }
844 #endif
845 
846 /// Create an analysis remark that explains why vectorization failed
847 ///
848 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
849 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
850 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
851 /// the location of the remark.  \return the remark object that can be
852 /// streamed to.
853 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
854     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
855   Value *CodeRegion = TheLoop->getHeader();
856   DebugLoc DL = TheLoop->getStartLoc();
857 
858   if (I) {
859     CodeRegion = I->getParent();
860     // If there is no debug location attached to the instruction, fall back to
861     // using the loop's.
862     if (I->getDebugLoc())
863       DL = I->getDebugLoc();
864   }
865 
866   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
867   R << "loop not vectorized: ";
868   return R;
869 }
870 
871 namespace llvm {
872 
873 void reportVectorizationFailure(const StringRef DebugMsg,
874     const StringRef OREMsg, const StringRef ORETag,
875     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
876   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
877   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
878   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
879                 ORETag, TheLoop, I) << OREMsg);
880 }
881 
882 } // end namespace llvm
883 
884 #ifndef NDEBUG
885 /// \return string containing a file name and a line # for the given loop.
886 static std::string getDebugLocString(const Loop *L) {
887   std::string Result;
888   if (L) {
889     raw_string_ostream OS(Result);
890     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
891       LoopDbgLoc.print(OS);
892     else
893       // Just print the module name.
894       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
895     OS.flush();
896   }
897   return Result;
898 }
899 #endif
900 
901 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
902                                          const Instruction *Orig) {
903   // If the loop was versioned with memchecks, add the corresponding no-alias
904   // metadata.
905   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
906     LVer->annotateInstWithNoAlias(To, Orig);
907 }
908 
909 void InnerLoopVectorizer::addMetadata(Instruction *To,
910                                       Instruction *From) {
911   propagateMetadata(To, From);
912   addNewMetadata(To, From);
913 }
914 
915 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
916                                       Instruction *From) {
917   for (Value *V : To) {
918     if (Instruction *I = dyn_cast<Instruction>(V))
919       addMetadata(I, From);
920   }
921 }
922 
923 namespace llvm {
924 
925 // Hints used by the loop-vectorization cost model to decide how the scalar
926 // epilogue loop should be lowered.
927 enum ScalarEpilogueLowering {
928 
929   // The default: allowing scalar epilogues.
930   CM_ScalarEpilogueAllowed,
931 
932   // Vectorization with OptForSize: don't allow epilogues.
933   CM_ScalarEpilogueNotAllowedOptSize,
934 
935   // A special case of vectorization with OptForSize: loops with a very small
936   // trip count are considered for vectorization under OptForSize, thereby
937   // making sure the cost of their loop body is dominant, free of runtime
938   // guards and scalar iteration overheads.
939   CM_ScalarEpilogueNotAllowedLowTripLoop,
940 
941   // Loop hint predicate indicating an epilogue is undesired.
942   CM_ScalarEpilogueNotNeededUsePredicate
943 };
944 
945 /// LoopVectorizationCostModel - estimates the expected speedups due to
946 /// vectorization.
947 /// In many cases vectorization is not profitable. This can happen because of
948 /// a number of reasons. In this class we mainly attempt to predict the
949 /// expected speedup/slowdowns due to the supported instruction set. We use the
950 /// TargetTransformInfo to query the different backends for the cost of
951 /// different operations.
952 class LoopVectorizationCostModel {
953 public:
954   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
955                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
956                              LoopVectorizationLegality *Legal,
957                              const TargetTransformInfo &TTI,
958                              const TargetLibraryInfo *TLI, DemandedBits *DB,
959                              AssumptionCache *AC,
960                              OptimizationRemarkEmitter *ORE, const Function *F,
961                              const LoopVectorizeHints *Hints,
962                              InterleavedAccessInfo &IAI)
963       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
964         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
965         Hints(Hints), InterleaveInfo(IAI) {}
966 
967   /// \return An upper bound for the vectorization factor, or None if
968   /// vectorization and interleaving should be avoided up front.
969   Optional<unsigned> computeMaxVF();
970 
971   /// \return True if runtime checks are required for vectorization, and false
972   /// otherwise.
973   bool runtimeChecksRequired();
974 
975   /// \return The most profitable vectorization factor and the cost of that VF.
976   /// This method checks every power of two up to MaxVF. If UserVF is not zero,
977   /// then this vectorization factor will be selected if vectorization is
978   /// possible.
979   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
980 
981   /// Setup cost-based decisions for user vectorization factor.
982   void selectUserVectorizationFactor(unsigned UserVF) {
983     collectUniformsAndScalars(UserVF);
984     collectInstsToScalarize(UserVF);
985   }
986 
987   /// \return The size (in bits) of the smallest and widest types in the code
988   /// that needs to be vectorized. We ignore values that remain scalar such as
989   /// 64 bit loop indices.
990   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
991 
992   /// \return The desired interleave count.
993   /// If interleave count has been specified by metadata it will be returned.
994   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
995   /// are the selected vectorization factor and the cost of the selected VF.
996   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
997 
998   /// A memory access instruction may be vectorized in more than one way; the
999   /// form it takes after vectorization depends on its cost.
1000   /// This function takes cost-based decisions for Load/Store instructions
1001   /// and collects them in a map. This decision map is used for building
1002   /// the lists of loop-uniform and loop-scalar instructions.
1003   /// The calculated cost is saved with the widening decision in order to
1004   /// avoid redundant calculations.
1005   void setCostBasedWideningDecision(unsigned VF);
1006 
1007   /// A struct that represents some properties of the register usage
1008   /// of a loop.
1009   struct RegisterUsage {
1010     /// Holds the number of loop invariant values that are used in the loop.
1011     /// The key is ClassID of target-provided register class.
1012     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1013     /// Holds the maximum number of concurrent live intervals in the loop.
1014     /// The key is ClassID of target-provided register class.
1015     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1016   };
1017 
1018   /// \return Returns information about the register usages of the loop for the
1019   /// given vectorization factors.
1020   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1021 
1022   /// Collect values we want to ignore in the cost model.
1023   void collectValuesToIgnore();
1024 
1025   /// \returns The smallest bitwidth each instruction can be represented with.
1026   /// The vector equivalents of these instructions should be truncated to this
1027   /// type.
1028   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1029     return MinBWs;
1030   }
1031 
1032   /// \returns True if it is more profitable to scalarize instruction \p I for
1033   /// vectorization factor \p VF.
1034   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1035     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1036 
1037     // Cost model is not run in the VPlan-native path - return conservative
1038     // result until this changes.
1039     if (EnableVPlanNativePath)
1040       return false;
1041 
1042     auto Scalars = InstsToScalarize.find(VF);
1043     assert(Scalars != InstsToScalarize.end() &&
1044            "VF not yet analyzed for scalarization profitability");
1045     return Scalars->second.find(I) != Scalars->second.end();
1046   }
1047 
1048   /// Returns true if \p I is known to be uniform after vectorization.
1049   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1050     if (VF == 1)
1051       return true;
1052 
1053     // Cost model is not run in the VPlan-native path - return conservative
1054     // result until this changes.
1055     if (EnableVPlanNativePath)
1056       return false;
1057 
1058     auto UniformsPerVF = Uniforms.find(VF);
1059     assert(UniformsPerVF != Uniforms.end() &&
1060            "VF not yet analyzed for uniformity");
1061     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1062   }
1063 
1064   /// Returns true if \p I is known to be scalar after vectorization.
1065   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1066     if (VF == 1)
1067       return true;
1068 
1069     // Cost model is not run in the VPlan-native path - return conservative
1070     // result until this changes.
1071     if (EnableVPlanNativePath)
1072       return false;
1073 
1074     auto ScalarsPerVF = Scalars.find(VF);
1075     assert(ScalarsPerVF != Scalars.end() &&
1076            "Scalar values are not calculated for VF");
1077     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1078   }
1079 
1080   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1081   /// for vectorization factor \p VF.
1082   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1083     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1084            !isProfitableToScalarize(I, VF) &&
1085            !isScalarAfterVectorization(I, VF);
1086   }
1087 
1088   /// Decision that was taken during cost calculation for a memory instruction.
1089   enum InstWidening {
1090     CM_Unknown,
1091     CM_Widen,         // For consecutive accesses with stride +1.
1092     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1093     CM_Interleave,
1094     CM_GatherScatter,
1095     CM_Scalarize
1096   };
1097 
1098   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1099   /// instruction \p I and vector width \p VF.
1100   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1101                            unsigned Cost) {
1102     assert(VF >= 2 && "Expected VF >=2");
1103     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1104   }
1105 
1106   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1107   /// interleaving group \p Grp and vector width \p VF.
1108   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1109                            InstWidening W, unsigned Cost) {
1110     assert(VF >= 2 && "Expected VF >=2");
1111     /// Broadcast this decision to all instructions inside the group.
1112     /// But the cost will be assigned to one instruction only.
1113     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1114       if (auto *I = Grp->getMember(i)) {
1115         if (Grp->getInsertPos() == I)
1116           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1117         else
1118           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1119       }
1120     }
1121   }
1122 
1123   /// Return the cost model decision for the given instruction \p I and vector
1124   /// width \p VF. Return CM_Unknown if this instruction did not pass
1125   /// through the cost modeling.
1126   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1127     assert(VF >= 2 && "Expected VF >=2");
1128 
1129     // Cost model is not run in the VPlan-native path - return conservative
1130     // result until this changes.
1131     if (EnableVPlanNativePath)
1132       return CM_GatherScatter;
1133 
1134     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1135     auto Itr = WideningDecisions.find(InstOnVF);
1136     if (Itr == WideningDecisions.end())
1137       return CM_Unknown;
1138     return Itr->second.first;
1139   }
1140 
1141   /// Return the vectorization cost for the given instruction \p I and vector
1142   /// width \p VF.
1143   unsigned getWideningCost(Instruction *I, unsigned VF) {
1144     assert(VF >= 2 && "Expected VF >=2");
1145     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1146     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1147            "The cost is not calculated");
1148     return WideningDecisions[InstOnVF].second;
1149   }
1150 
1151   /// Return True if instruction \p I is an optimizable truncate whose operand
1152   /// is an induction variable. Such a truncate will be removed by adding a new
1153   /// induction variable with the destination type.
1154   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1155     // If the instruction is not a truncate, return false.
1156     auto *Trunc = dyn_cast<TruncInst>(I);
1157     if (!Trunc)
1158       return false;
1159 
1160     // Get the source and destination types of the truncate.
1161     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1162     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1163 
1164     // If the truncate is free for the given types, return false. Replacing a
1165     // free truncate with an induction variable would add an induction variable
1166     // update instruction to each iteration of the loop. We exclude from this
1167     // check the primary induction variable since it will need an update
1168     // instruction regardless.
1169     Value *Op = Trunc->getOperand(0);
1170     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1171       return false;
1172 
1173     // If the truncated value is not an induction variable, return false.
1174     return Legal->isInductionPhi(Op);
1175   }
1176 
1177   /// Collects the instructions to scalarize for each predicated instruction in
1178   /// the loop.
1179   void collectInstsToScalarize(unsigned VF);
1180 
1181   /// Collect Uniform and Scalar values for the given \p VF.
1182   /// The sets depend on CM decision for Load/Store instructions
1183   /// that may be vectorized as interleave, gather-scatter or scalarized.
1184   void collectUniformsAndScalars(unsigned VF) {
1185     // Do the analysis once.
1186     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1187       return;
1188     setCostBasedWideningDecision(VF);
1189     collectLoopUniforms(VF);
1190     collectLoopScalars(VF);
1191   }
1192 
1193   /// Returns true if the target machine supports masked store operation
1194   /// for the given \p DataType and kind of access to \p Ptr.
1195   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1196     return Legal->isConsecutivePtr(Ptr) &&
1197            TTI.isLegalMaskedStore(DataType, Alignment);
1198   }
1199 
1200   /// Returns true if the target machine supports masked load operation
1201   /// for the given \p DataType and kind of access to \p Ptr.
1202   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1203     return Legal->isConsecutivePtr(Ptr) &&
1204            TTI.isLegalMaskedLoad(DataType, Alignment);
1205   }
1206 
1207   /// Returns true if the target machine supports masked scatter operation
1208   /// for the given \p DataType.
1209   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1210     return TTI.isLegalMaskedScatter(DataType, Alignment);
1211   }
1212 
1213   /// Returns true if the target machine supports masked gather operation
1214   /// for the given \p DataType.
1215   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1216     return TTI.isLegalMaskedGather(DataType, Alignment);
1217   }
1218 
1219   /// Returns true if the target machine can represent \p V as a masked gather
1220   /// or scatter operation.
1221   bool isLegalGatherOrScatter(Value *V) {
1222     bool LI = isa<LoadInst>(V);
1223     bool SI = isa<StoreInst>(V);
1224     if (!LI && !SI)
1225       return false;
1226     auto *Ty = getMemInstValueType(V);
1227     MaybeAlign Align = getLoadStoreAlignment(V);
1228     return (LI && isLegalMaskedGather(Ty, Align)) ||
1229            (SI && isLegalMaskedScatter(Ty, Align));
1230   }
1231 
1232   /// Returns true if \p I is an instruction that will be scalarized with
1233   /// predication. Such instructions include conditional stores and
1234   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1237   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1238 
1239   // Returns true if \p I is an instruction that will be predicated either
1240   // through scalar predication or masked load/store or masked gather/scatter.
1241   // Superset of instructions that return true for isScalarWithPredication.
1242   bool isPredicatedInst(Instruction *I) {
1243     if (!blockNeedsPredication(I->getParent()))
1244       return false;
1245     // Loads and stores that need some form of masked operation are predicated
1246     // instructions.
1247     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1248       return Legal->isMaskRequired(I);
1249     return isScalarWithPredication(I);
1250   }
1251 
1252   /// Returns true if \p I is a memory instruction with consecutive memory
1253   /// access that can be widened.
1254   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1255 
1256   /// Returns true if \p I is a memory instruction in an interleaved-group
1257   /// of memory accesses that can be vectorized with wide vector loads/stores
1258   /// and shuffles.
1259   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1260 
1261   /// Check if \p Instr belongs to any interleaved access group.
1262   bool isAccessInterleaved(Instruction *Instr) {
1263     return InterleaveInfo.isInterleaved(Instr);
1264   }
1265 
1266   /// Get the interleaved access group that \p Instr belongs to.
1267   const InterleaveGroup<Instruction> *
1268   getInterleavedAccessGroup(Instruction *Instr) {
1269     return InterleaveInfo.getInterleaveGroup(Instr);
1270   }
1271 
1272   /// Returns true if an interleaved group requires a scalar iteration
1273   /// to handle accesses with gaps, and there is nothing preventing us from
1274   /// creating a scalar epilogue.
1275   bool requiresScalarEpilogue() const {
1276     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1277   }
1278 
  /// Returns true if a scalar epilogue is allowed, i.e., it has not been
  /// disallowed due to optsize or a loop hint annotation.
1281   bool isScalarEpilogueAllowed() const {
1282     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1283   }
1284 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1286   bool foldTailByMasking() const { return FoldTailByMasking; }
1287 
1288   bool blockNeedsPredication(BasicBlock *BB) {
1289     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1290   }
1291 
1292   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1293   /// with factor VF.  Return the cost of the instruction, including
1294   /// scalarization overhead if it's needed.
1295   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1296 
1297   /// Estimate cost of a call instruction CI if it were vectorized with factor
1298   /// VF. Return the cost of the instruction, including scalarization overhead
1299   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e., either a vector version isn't available or it is too
  /// expensive.
1302   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1303 
1304 private:
1305   unsigned NumPredStores = 0;
1306 
1307   /// \return An upper bound for the vectorization factor, larger than zero.
1308   /// One is returned if vectorization should best be avoided due to cost.
1309   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1310 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1318   using VectorizationCostTy = std::pair<unsigned, bool>;
1319 
1320   /// Returns the expected execution cost. The unit of the cost does
1321   /// not matter because we use the 'cost' units to compare different
1322   /// vector widths. The cost that is returned is *not* normalized by
1323   /// the factor width.
1324   VectorizationCostTy expectedCost(unsigned VF);
1325 
1326   /// Returns the execution time cost of an instruction for a given vector
1327   /// width. Vector width of one means scalar.
1328   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1329 
1330   /// The cost-computation logic from getInstructionCost which provides
1331   /// the vector type as an output parameter.
1332   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1333 
1334   /// Calculate vectorization cost of memory instruction \p I.
1335   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1336 
1337   /// The cost computation for scalarized memory instruction.
1338   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1339 
1340   /// The cost computation for interleaving group of memory instructions.
1341   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1342 
1343   /// The cost computation for Gather/Scatter instruction.
1344   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1345 
1346   /// The cost computation for widening instruction \p I with consecutive
1347   /// memory access.
1348   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1349 
  /// The cost calculation for a Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (extract of the last vector element, unless the
  /// stored value is loop invariant).
1354   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1355 
1356   /// Estimate the overhead of scalarizing an instruction. This is a
1357   /// convenience wrapper for the type-based getScalarizationOverhead API.
1358   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1359 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1362   bool isConsecutiveLoadOrStore(Instruction *I);
1363 
1364   /// Returns true if an artificially high cost for emulated masked memrefs
1365   /// should be used.
1366   bool useEmulatedMaskMemRefHack(Instruction *I);
1367 
1368   /// Map of scalar integer values to the smallest bitwidth they can be legally
1369   /// represented as. The vector equivalents of these values should be truncated
1370   /// to this type.
1371   MapVector<Instruction *, uint64_t> MinBWs;
1372 
1373   /// A type representing the costs for instructions if they were to be
1374   /// scalarized rather than vectorized. The entries are Instruction-Cost
1375   /// pairs.
1376   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1377 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1380   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1381 
1382   /// Records whether it is allowed to have the original scalar loop execute at
1383   /// least once. This may be needed as a fallback loop in case runtime
1384   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not a multiple of the VF,
1386   /// or as a peel-loop to handle gaps in interleave-groups.
1387   /// Under optsize and when the trip count is very small we don't allow any
1388   /// iterations to execute in the scalar loop.
1389   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1390 
1391   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1392   bool FoldTailByMasking = false;
1393 
1394   /// A map holding scalar costs for different vectorization factors. The
1395   /// presence of a cost for an instruction in the mapping indicates that the
1396   /// instruction will be scalarized when vectorizing with the associated
1397   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1398   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1399 
1400   /// Holds the instructions known to be uniform after vectorization.
1401   /// The data is collected per VF.
1402   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1403 
1404   /// Holds the instructions known to be scalar after vectorization.
1405   /// The data is collected per VF.
1406   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1407 
1408   /// Holds the instructions (address computations) that are forced to be
1409   /// scalarized.
1410   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1411 
1412   /// Returns the expected difference in cost from scalarizing the expression
1413   /// feeding a predicated instruction \p PredInst. The instructions to
1414   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1415   /// non-negative return value implies the expression will be scalarized.
1416   /// Currently, only single-use chains are considered for scalarization.
1417   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1418                               unsigned VF);
1419 
1420   /// Collect the instructions that are uniform after vectorization. An
1421   /// instruction is uniform if we represent it with a single scalar value in
1422   /// the vectorized loop corresponding to each vector iteration. Examples of
1423   /// uniform instructions include pointer operands of consecutive or
1424   /// interleaved memory accesses. Note that although uniformity implies an
1425   /// instruction will be scalar, the reverse is not true. In general, a
1426   /// scalarized instruction will be represented by VF scalar values in the
1427   /// vectorized loop, each corresponding to an iteration of the original
1428   /// scalar loop.
1429   void collectLoopUniforms(unsigned VF);
1430 
1431   /// Collect the instructions that are scalar after vectorization. An
1432   /// instruction is scalar if it is known to be uniform or will be scalarized
1433   /// during vectorization. Non-uniform scalarized instructions will be
1434   /// represented by VF values in the vectorized loop, each corresponding to an
1435   /// iteration of the original scalar loop.
1436   void collectLoopScalars(unsigned VF);
1437 
1438   /// Keeps cost model vectorization decision and cost for instructions.
1439   /// Right now it is used for memory instructions only.
1440   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1441                                 std::pair<InstWidening, unsigned>>;
1442 
1443   DecisionList WideningDecisions;
1444 
1445   /// Returns true if \p V is expected to be vectorized and it needs to be
1446   /// extracted.
1447   bool needsExtract(Value *V, unsigned VF) const {
1448     Instruction *I = dyn_cast<Instruction>(V);
1449     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1450       return false;
1451 
1452     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1454     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1455     // the scalars are collected. That should be a safe assumption in most
1456     // cases, because we check if the operands have vectorizable types
1457     // beforehand in LoopVectorizationLegality.
1458     return Scalars.find(VF) == Scalars.end() ||
1459            !isScalarAfterVectorization(I, VF);
1460   };
1461 
1462   /// Returns a range containing only operands needing to be extracted.
1463   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1464                                                    unsigned VF) {
1465     return SmallVector<Value *, 4>(make_filter_range(
1466         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1467   }
1468 
1469 public:
1470   /// The loop that we evaluate.
1471   Loop *TheLoop;
1472 
1473   /// Predicated scalar evolution analysis.
1474   PredicatedScalarEvolution &PSE;
1475 
1476   /// Loop Info analysis.
1477   LoopInfo *LI;
1478 
1479   /// Vectorization legality.
1480   LoopVectorizationLegality *Legal;
1481 
1482   /// Vector target information.
1483   const TargetTransformInfo &TTI;
1484 
1485   /// Target Library Info.
1486   const TargetLibraryInfo *TLI;
1487 
1488   /// Demanded bits analysis.
1489   DemandedBits *DB;
1490 
1491   /// Assumption cache.
1492   AssumptionCache *AC;
1493 
1494   /// Interface to emit optimization remarks.
1495   OptimizationRemarkEmitter *ORE;
1496 
1497   const Function *TheFunction;
1498 
1499   /// Loop Vectorize Hint.
1500   const LoopVectorizeHints *Hints;
1501 
1502   /// The interleave access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1504   InterleavedAccessInfo &InterleaveInfo;
1505 
1506   /// Values to ignore in the cost model.
1507   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1508 
1509   /// Values to ignore in the cost model when VF > 1.
1510   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1511 };
1512 
1513 } // end namespace llvm
1514 
1515 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1516 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1522 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1523 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1524 // provides *explicit vectorization hints* (LV can bypass legal checks and
1525 // assume that vectorization is legal). However, both hints are implemented
1526 // using the same metadata (llvm.loop.vectorize, processed by
1527 // LoopVectorizeHints). This will be fixed in the future when the native IR
1528 // representation for pragma 'omp simd' is introduced.
1529 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1530                                    OptimizationRemarkEmitter *ORE) {
1531   assert(!OuterLp->empty() && "This is not an outer loop");
1532   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1533 
1534   // Only outer loops with an explicit vectorization hint are supported.
1535   // Unannotated outer loops are ignored.
1536   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1537     return false;
1538 
1539   Function *Fn = OuterLp->getHeader()->getParent();
1540   if (!Hints.allowVectorization(Fn, OuterLp,
1541                                 true /*VectorizeOnlyWhenForced*/)) {
1542     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1543     return false;
1544   }
1545 
1546   if (Hints.getInterleave() > 1) {
1547     // TODO: Interleave support is future work.
1548     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1549                          "outer loops.\n");
1550     Hints.emitRemarkWithHints();
1551     return false;
1552   }
1553 
1554   return true;
1555 }
1556 
1557 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1558                                   OptimizationRemarkEmitter *ORE,
1559                                   SmallVectorImpl<Loop *> &V) {
1560   // Collect inner loops and outer loops without irreducible control flow. For
1561   // now, only collect outer loops that have explicit vectorization hints. If we
1562   // are stress testing the VPlan H-CFG construction, we collect the outermost
1563   // loop of every loop nest.
1564   if (L.empty() || VPlanBuildStressTest ||
1565       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1566     LoopBlocksRPO RPOT(&L);
1567     RPOT.perform(LI);
1568     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1569       V.push_back(&L);
1570       // TODO: Collect inner loops inside marked outer loops in case
1571       // vectorization fails for the outer loop. Do not invoke
1572       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1573       // already known to be reducible. We can use an inherited attribute for
1574       // that.
1575       return;
1576     }
1577   }
1578   for (Loop *InnerL : L)
1579     collectSupportedLoops(*InnerL, LI, ORE, V);
1580 }
1581 
1582 namespace {
1583 
1584 /// The LoopVectorize Pass.
1585 struct LoopVectorize : public FunctionPass {
1586   /// Pass identification, replacement for typeid
1587   static char ID;
1588 
1589   LoopVectorizePass Impl;
1590 
1591   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1592                          bool VectorizeOnlyWhenForced = false)
1593       : FunctionPass(ID) {
1594     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1595     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1596     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1597   }
1598 
1599   bool runOnFunction(Function &F) override {
1600     if (skipFunction(F))
1601       return false;
1602 
1603     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1604     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1605     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1606     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1607     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1608     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1609     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1610     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1611     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1612     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1613     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1614     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1615     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1616 
1617     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1618         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1619 
1620     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1621                         GetLAA, *ORE, PSI);
1622   }
1623 
1624   void getAnalysisUsage(AnalysisUsage &AU) const override {
1625     AU.addRequired<AssumptionCacheTracker>();
1626     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1627     AU.addRequired<DominatorTreeWrapperPass>();
1628     AU.addRequired<LoopInfoWrapperPass>();
1629     AU.addRequired<ScalarEvolutionWrapperPass>();
1630     AU.addRequired<TargetTransformInfoWrapperPass>();
1631     AU.addRequired<AAResultsWrapperPass>();
1632     AU.addRequired<LoopAccessLegacyAnalysis>();
1633     AU.addRequired<DemandedBitsWrapperPass>();
1634     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1635     AU.addRequired<InjectTLIMappingsLegacy>();
1636 
1637     // We currently do not preserve loopinfo/dominator analyses with outer loop
1638     // vectorization. Until this is addressed, mark these analyses as preserved
1639     // only for non-VPlan-native path.
1640     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1641     if (!EnableVPlanNativePath) {
1642       AU.addPreserved<LoopInfoWrapperPass>();
1643       AU.addPreserved<DominatorTreeWrapperPass>();
1644     }
1645 
1646     AU.addPreserved<BasicAAWrapperPass>();
1647     AU.addPreserved<GlobalsAAWrapperPass>();
1648     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1649   }
1650 };
1651 
1652 } // end anonymous namespace
1653 
1654 //===----------------------------------------------------------------------===//
1655 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1656 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1657 //===----------------------------------------------------------------------===//
1658 
1659 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1660   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1663   Instruction *Instr = dyn_cast<Instruction>(V);
1664   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1665                      (!Instr ||
1666                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1667   // Place the code for broadcasting invariant variables in the new preheader.
1668   IRBuilder<>::InsertPointGuard Guard(Builder);
1669   if (SafeToHoist)
1670     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1671 
1672   // Broadcast the scalar into all locations in the vector.
1673   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1674 
1675   return Shuf;
1676 }
1677 
1678 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1679     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1680   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1681          "Expected either an induction phi-node or a truncate of it!");
1682   Value *Start = II.getStartValue();
1683 
1684   // Construct the initial value of the vector IV in the vector loop preheader
1685   auto CurrIP = Builder.saveIP();
1686   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1687   if (isa<TruncInst>(EntryVal)) {
1688     assert(Start->getType()->isIntegerTy() &&
1689            "Truncation requires an integer type");
1690     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1691     Step = Builder.CreateTrunc(Step, TruncType);
1692     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1693   }
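  // Create the vector start value: splat Start across all lanes, then step
  // each lane by <0, 1, ..., VF-1> * Step (using the FP induction opcode for
  // floating-point inductions).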
1694   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1695   Value *SteppedStart =
1696       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1697 
1698   // We create vector phi nodes for both integer and floating-point induction
1699   // variables. Here, we determine the kind of arithmetic we will perform.
1700   Instruction::BinaryOps AddOp;
1701   Instruction::BinaryOps MulOp;
1702   if (Step->getType()->isIntegerTy()) {
1703     AddOp = Instruction::Add;
1704     MulOp = Instruction::Mul;
1705   } else {
1706     AddOp = II.getInductionOpcode();
1707     MulOp = Instruction::FMul;
1708   }
1709 
1710   // Multiply the vectorization factor by the step using integer or
1711   // floating-point arithmetic as appropriate.
1712   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1713   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1714 
1715   // Create a vector splat to use in the induction update.
1716   //
1717   // FIXME: If the step is non-constant, we create the vector splat with
1718   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1719   //        handle a constant vector splat.
1720   Value *SplatVF =
1721       isa<Constant>(Mul)
1722           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1723           : Builder.CreateVectorSplat(VF, Mul);
1724   Builder.restoreIP(CurrIP);
1725 
1726   // We may need to add the step a number of times, depending on the unroll
1727   // factor. The last of those goes into the PHI.
1728   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1729                                     &*LoopVectorBody->getFirstInsertionPt());
1730   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1731   Instruction *LastInduction = VecInd;
1732   for (unsigned Part = 0; Part < UF; ++Part) {
1733     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1734 
1735     if (isa<TruncInst>(EntryVal))
1736       addMetadata(LastInduction, EntryVal);
1737     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1738 
1739     LastInduction = cast<Instruction>(addFastMathFlag(
1740         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1741     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1742   }
1743 
1744   // Move the last step to the end of the latch block. This ensures consistent
1745   // placement of all induction updates.
1746   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1747   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1748   auto *ICmp = cast<Instruction>(Br->getCondition());
1749   LastInduction->moveBefore(ICmp);
1750   LastInduction->setName("vec.ind.next");
1751 
1752   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1753   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1754 }
1755 
1756 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1757   return Cost->isScalarAfterVectorization(I, VF) ||
1758          Cost->isProfitableToScalarize(I, VF);
1759 }
1760 
1761 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1762   if (shouldScalarizeInstruction(IV))
1763     return true;
1764   auto isScalarInst = [&](User *U) -> bool {
1765     auto *I = cast<Instruction>(U);
1766     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1767   };
1768   return llvm::any_of(IV->users(), isScalarInst);
1769 }
1770 
1771 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1772     const InductionDescriptor &ID, const Instruction *EntryVal,
1773     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1774   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1775          "Expected either an induction phi-node or a truncate of it!");
1776 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't
  // have to do any recording in this case - that is done when the original IV
  // is processed.
1783   if (isa<TruncInst>(EntryVal))
1784     return;
1785 
1786   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1787   if (Casts.empty())
1788     return;
1789   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1791   // induction update chain itself.
1792   Instruction *CastInst = *Casts.begin();
1793   if (Lane < UINT_MAX)
1794     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1795   else
1796     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1797 }
1798 
1799 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1800   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1801          "Primary induction variable must have an integer type");
1802 
1803   auto II = Legal->getInductionVars().find(IV);
1804   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1805 
1806   auto ID = II->second;
1807   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1808 
1809   // The scalar value to broadcast. This will be derived from the canonical
1810   // induction variable.
1811   Value *ScalarIV = nullptr;
1812 
1813   // The value from the original loop to which we are mapping the new induction
1814   // variable.
1815   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1816 
1817   // True if we have vectorized the induction variable.
1818   auto VectorizedIV = false;
1819 
1820   // Determine if we want a scalar version of the induction variable. This is
1821   // true if the induction variable itself is not widened, or if it has at
1822   // least one user in the loop that is not widened.
1823   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1824 
1825   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1827   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1828          "Induction step should be loop invariant");
1829   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1830   Value *Step = nullptr;
1831   if (PSE.getSE()->isSCEVable(IV->getType())) {
1832     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1833     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1834                              LoopVectorPreHeader->getTerminator());
1835   } else {
1836     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1837   }
1838 
1839   // Try to create a new independent vector induction variable. If we can't
1840   // create the phi node, we will splat the scalar induction variable in each
1841   // loop iteration.
1842   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1843     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1844     VectorizedIV = true;
1845   }
1846 
1847   // If we haven't yet vectorized the induction variable, or if we will create
1848   // a scalar one, we need to define the scalar induction variable and step
1849   // values. If we were given a truncation type, truncate the canonical
1850   // induction variable and step. Otherwise, derive these values from the
1851   // induction descriptor.
1852   if (!VectorizedIV || NeedsScalarIV) {
1853     ScalarIV = Induction;
1854     if (IV != OldInduction) {
1855       ScalarIV = IV->getType()->isIntegerTy()
1856                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1857                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1858                                           IV->getType());
1859       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1860       ScalarIV->setName("offset.idx");
1861     }
1862     if (Trunc) {
1863       auto *TruncType = cast<IntegerType>(Trunc->getType());
1864       assert(Step->getType()->isIntegerTy() &&
1865              "Truncation requires an integer step");
1866       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1867       Step = Builder.CreateTrunc(Step, TruncType);
1868     }
1869   }
1870 
1871   // If we haven't yet vectorized the induction variable, splat the scalar
1872   // induction variable, and build the necessary step vectors.
1873   // TODO: Don't do it unless the vectorized IV is really required.
1874   if (!VectorizedIV) {
1875     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1876     for (unsigned Part = 0; Part < UF; ++Part) {
1877       Value *EntryPart =
1878           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1879       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1880       if (Trunc)
1881         addMetadata(EntryPart, Trunc);
1882       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1883     }
1884   }
1885 
1886   // If an induction variable is only used for counting loop iterations or
1887   // calculating addresses, it doesn't need to be widened. Create scalar steps
1888   // that can be used by instructions we will later scalarize. Note that the
1889   // addition of the scalar steps will not increase the number of instructions
1890   // in the loop in the common case prior to InstCombine. We will be trading
1891   // one vector extract for each scalar step.
1892   if (NeedsScalarIV)
1893     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1894 }
1895 
1896 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1897                                           Instruction::BinaryOps BinOp) {
1898   // Create and check the types.
1899   assert(Val->getType()->isVectorTy() && "Must be a vector");
1900   int VLen = Val->getType()->getVectorNumElements();
1901 
1902   Type *STy = Val->getType()->getScalarType();
1903   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1904          "Induction Step must be an integer or FP");
1905   assert(Step->getType() == STy && "Step has wrong type");
1906 
1907   SmallVector<Constant *, 8> Indices;
1908 
1909   if (STy->isIntegerTy()) {
    // Create a vector of consecutive indices starting at StartIdx.
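    // For example, with VLen = 4 and StartIdx = 2 this is <2, 3, 4, 5>.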
1911     for (int i = 0; i < VLen; ++i)
1912       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1913 
1914     // Add the consecutive indices to the vector value.
1915     Constant *Cv = ConstantVector::get(Indices);
1916     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1917     Step = Builder.CreateVectorSplat(VLen, Step);
1918     assert(Step->getType() == Val->getType() && "Invalid step vec");
1919     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1920     // which can be found from the original scalar operations.
1921     Step = Builder.CreateMul(Cv, Step);
1922     return Builder.CreateAdd(Val, Step, "induction");
1923   }
1924 
1925   // Floating point induction.
1926   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1927          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive indices starting at StartIdx.
1929   for (int i = 0; i < VLen; ++i)
1930     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1931 
1932   // Add the consecutive indices to the vector value.
1933   Constant *Cv = ConstantVector::get(Indices);
1934 
1935   Step = Builder.CreateVectorSplat(VLen, Step);
1936 
1937   // Floating point operations had to be 'fast' to enable the induction.
1938   FastMathFlags Flags;
1939   Flags.setFast();
1940 
1941   Value *MulOp = Builder.CreateFMul(Cv, Step);
1942   if (isa<Instruction>(MulOp))
    // We have to check because MulOp may be a constant after folding.
1944     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1945 
1946   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1947   if (isa<Instruction>(BOp))
1948     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1949   return BOp;
1950 }
1951 
1952 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1953                                            Instruction *EntryVal,
1954                                            const InductionDescriptor &ID) {
1955   // We shouldn't have to build scalar steps if we aren't vectorizing.
1956   assert(VF > 1 && "VF should be greater than one");
1957 
  // Get the value type and ensure it and the step have the same type.
1959   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1960   assert(ScalarIVTy == Step->getType() &&
1961          "Val and Step should have the same type");
1962 
1963   // We build scalar steps for both integer and floating-point induction
1964   // variables. Here, we determine the kind of arithmetic we will perform.
1965   Instruction::BinaryOps AddOp;
1966   Instruction::BinaryOps MulOp;
1967   if (ScalarIVTy->isIntegerTy()) {
1968     AddOp = Instruction::Add;
1969     MulOp = Instruction::Mul;
1970   } else {
1971     AddOp = ID.getInductionOpcode();
1972     MulOp = Instruction::FMul;
1973   }
1974 
1975   // Determine the number of scalars we need to generate for each unroll
1976   // iteration. If EntryVal is uniform, we only need to generate the first
1977   // lane. Otherwise, we generate all VF values.
1978   unsigned Lanes =
1979       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1980                                                                          : VF;
1981   // Compute the scalar steps and save the results in VectorLoopValueMap.
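  // For example, with UF = 2, VF = 4, and a non-uniform EntryVal, this emits
  // eight scalar values: ScalarIV + {0, 1, ..., 7} * Step.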
1982   for (unsigned Part = 0; Part < UF; ++Part) {
1983     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1984       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1985       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1986       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1987       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1988       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1989     }
1990   }
1991 }
1992 
1993 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1994   assert(V != Induction && "The new induction variable should not be used.");
1995   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1996   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1997 
1998   // If we have a stride that is replaced by one, do it here. Defer this for
1999   // the VPlan-native path until we start running Legal checks in that path.
2000   if (!EnableVPlanNativePath && Legal->hasStride(V))
2001     V = ConstantInt::get(V->getType(), 1);
2002 
2003   // If we have a vector mapped to this value, return it.
2004   if (VectorLoopValueMap.hasVectorValue(V, Part))
2005     return VectorLoopValueMap.getVectorValue(V, Part);
2006 
2007   // If the value has not been vectorized, check if it has been scalarized
2008   // instead. If it has been scalarized, and we actually need the value in
2009   // vector form, we will construct the vector values on demand.
2010   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2011     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2012 
2013     // If we've scalarized a value, that value should be an instruction.
2014     auto *I = cast<Instruction>(V);
2015 
2016     // If we aren't vectorizing, we can just copy the scalar map values over to
2017     // the vector map.
2018     if (VF == 1) {
2019       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2020       return ScalarValue;
2021     }
2022 
2023     // Get the last scalar instruction we generated for V and Part. If the value
2024     // is known to be uniform after vectorization, this corresponds to lane zero
2025     // of the Part unroll iteration. Otherwise, the last instruction is the one
2026     // we created for the last vector lane of the Part unroll iteration.
2027     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2028     auto *LastInst = cast<Instruction>(
2029         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2030 
2031     // Set the insert point after the last scalarized instruction. This ensures
2032     // the insertelement sequence will directly follow the scalar definitions.
2033     auto OldIP = Builder.saveIP();
2034     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2035     Builder.SetInsertPoint(&*NewIP);
2036 
2037     // However, if we are vectorizing, we need to construct the vector values.
2038     // If the value is known to be uniform after vectorization, we can just
2039     // broadcast the scalar value corresponding to lane zero for each unroll
2040     // iteration. Otherwise, we construct the vector values using insertelement
2041     // instructions. Since the resulting vectors are stored in
2042     // VectorLoopValueMap, we will only generate the insertelements once.
2043     Value *VectorValue = nullptr;
2044     if (Cost->isUniformAfterVectorization(I, VF)) {
2045       VectorValue = getBroadcastInstrs(ScalarValue);
2046       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2047     } else {
2048       // Initialize packing with insertelements to start from undef.
2049       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2050       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2051       for (unsigned Lane = 0; Lane < VF; ++Lane)
2052         packScalarIntoVectorValue(V, {Part, Lane});
2053       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2054     }
2055     Builder.restoreIP(OldIP);
2056     return VectorValue;
2057   }
2058 
2059   // If this scalar is unknown, assume that it is a constant or that it is
2060   // loop invariant. Broadcast V and save the value for future uses.
2061   Value *B = getBroadcastInstrs(V);
2062   VectorLoopValueMap.setVectorValue(V, Part, B);
2063   return B;
2064 }
2065 
2066 Value *
2067 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2068                                             const VPIteration &Instance) {
2069   // If the value is not an instruction contained in the loop, it should
2070   // already be scalar.
2071   if (OrigLoop->isLoopInvariant(V))
2072     return V;
2073 
2074   assert(Instance.Lane > 0
2075              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2076              : true && "Uniform values only have lane zero");
2077 
2078   // If the value from the original loop has not been vectorized, it is
2079   // represented by UF x VF scalar values in the new loop. Return the requested
2080   // scalar value.
2081   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2082     return VectorLoopValueMap.getScalarValue(V, Instance);
2083 
2084   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2085   // for the given unroll part. If this entry is not a vector type (i.e., the
2086   // vectorization factor is one), there is no need to generate an
2087   // extractelement instruction.
2088   auto *U = getOrCreateVectorValue(V, Instance.Part);
2089   if (!U->getType()->isVectorTy()) {
2090     assert(VF == 1 && "Value not scalarized has non-vector type");
2091     return U;
2092   }
2093 
2094   // Otherwise, the value from the original loop has been vectorized and is
2095   // represented by UF vector values. Extract and return the requested scalar
2096   // value from the appropriate vector lane.
2097   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2098 }
2099 
2100 void InnerLoopVectorizer::packScalarIntoVectorValue(
2101     Value *V, const VPIteration &Instance) {
2102   assert(V != Induction && "The new induction variable should not be used.");
2103   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2104   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2105 
2106   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2107   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
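  // Overwrite lane 'Instance.Lane' of the part's current vector value with
  // the scalar and publish the updated vector so later lanes build on it.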
2108   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2109                                             Builder.getInt32(Instance.Lane));
2110   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2111 }
2112 
2113 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2114   assert(Vec->getType()->isVectorTy() && "Invalid type");
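  // Build a shuffle mask <VF-1, VF-2, ..., 0> to reverse the vector lanes.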
2115   SmallVector<Constant *, 8> ShuffleMask;
2116   for (unsigned i = 0; i < VF; ++i)
2117     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2118 
2119   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2120                                      ConstantVector::get(ShuffleMask),
2121                                      "reverse");
2122 }
2123 
2124 // Return whether we allow using masked interleave-groups (for dealing with
2125 // strided loads/stores that reside in predicated blocks, or for dealing
2126 // with gaps).
2127 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2128   // If an override option has been passed in for interleaved accesses, use it.
2129   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2130     return EnableMaskedInterleavedMemAccesses;
2131 
2132   return TTI.enableMaskedInterleavedAccessVectorization();
2133 }
2134 
2135 // Try to vectorize the interleave group that \p Instr belongs to.
2136 //
2137 // E.g. Translate following interleaved load group (factor = 3):
2138 //   for (i = 0; i < N; i+=3) {
2139 //     R = Pic[i];             // Member of index 0
2140 //     G = Pic[i+1];           // Member of index 1
2141 //     B = Pic[i+2];           // Member of index 2
2142 //     ... // do something to R, G, B
2143 //   }
2144 // To:
2145 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2146 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2147 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2148 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2149 //
2150 // Or translate following interleaved store group (factor = 3):
2151 //   for (i = 0; i < N; i+=3) {
2152 //     ... do something to R, G, B
2153 //     Pic[i]   = R;           // Member of index 0
2154 //     Pic[i+1] = G;           // Member of index 1
2155 //     Pic[i+2] = B;           // Member of index 2
2156 //   }
2157 // To:
2158 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2159 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2160 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2161 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2162 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2163 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2164                                                    VPTransformState &State,
2165                                                    VPValue *Addr,
2166                                                    VPValue *BlockInMask) {
2167   const InterleaveGroup<Instruction> *Group =
2168       Cost->getInterleavedAccessGroup(Instr);
2169   assert(Group && "Fail to get an interleaved access group.");
2170 
2171   // Skip if current instruction is not the insert position.
2172   if (Instr != Group->getInsertPos())
2173     return;
2174 
2175   const DataLayout &DL = Instr->getModule()->getDataLayout();
2176 
2177   // Prepare for the vector type of the interleaved load/store.
2178   Type *ScalarTy = getMemInstValueType(Instr);
2179   unsigned InterleaveFactor = Group->getFactor();
2180   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
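  // For example, with VF = 4 and the factor-3 i32 group shown above, VecTy is
  // <12 x i32>.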
2181 
2182   // Prepare for the new pointers.
2183   SmallVector<Value *, 2> AddrParts;
2184   unsigned Index = Group->getIndex(Instr);
2185 
2186   // TODO: extend the masked interleaved-group support to reversed access.
2187   assert((!BlockInMask || !Group->isReverse()) &&
2188          "Reversed masked interleave-group not supported.");
2189 
2190   // If the group is reverse, adjust the index to refer to the last vector lane
2191   // instead of the first. We adjust the index from the first vector lane,
2192   // rather than directly getting the pointer for lane VF - 1, because the
2193   // pointer operand of the interleaved access is supposed to be uniform. For
2194   // uniform instructions, we're only required to generate a value for the
2195   // first vector lane in each unroll iteration.
2196   if (Group->isReverse())
2197     Index += (VF - 1) * Group->getFactor();
2198 
2199   for (unsigned Part = 0; Part < UF; Part++) {
2200     Value *AddrPart = State.get(Addr, {Part, 0});
2201     setDebugLocFromInst(Builder, AddrPart);
2202 
    // Note that the current instruction could be a member at any index. We
    // need to adjust the address to the member at index 0.
2205     //
2206     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2207     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2209     //
2210     // E.g.  A[i+1] = a;     // Member of index 1
2211     //       A[i]   = b;     // Member of index 0
2212     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2214 
2215     bool InBounds = false;
2216     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2217       InBounds = gep->isInBounds();
2218     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2219     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2220 
2221     // Cast to the vector pointer type.
2222     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2223     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2224     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2225   }
2226 
2227   setDebugLocFromInst(Builder, Instr);
2228   Value *UndefVec = UndefValue::get(VecTy);
2229 
2230   Value *MaskForGaps = nullptr;
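  // If the group has gaps and no scalar epilogue is available to handle them,
  // create a mask that disables the vector lanes corresponding to the missing
  // members, so the masked load below does not access them.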
2231   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2232     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2233     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2234   }
2235 
2236   // Vectorize the interleaved load group.
2237   if (isa<LoadInst>(Instr)) {
2238     // For each unroll part, create a wide load for the group.
2239     SmallVector<Value *, 2> NewLoads;
2240     for (unsigned Part = 0; Part < UF; Part++) {
2241       Instruction *NewLoad;
2242       if (BlockInMask || MaskForGaps) {
2243         assert(useMaskedInterleavedAccesses(*TTI) &&
2244                "masked interleaved groups are not allowed.");
2245         Value *GroupMask = MaskForGaps;
2246         if (BlockInMask) {
2247           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2248           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
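          // Replicate each lane of the block mask InterleaveFactor times so
          // that it covers every member of the group in the wide load.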
2249           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2250           Value *ShuffledMask = Builder.CreateShuffleVector(
2251               BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2252           GroupMask = MaskForGaps
2253                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2254                                                 MaskForGaps)
2255                           : ShuffledMask;
2256         }
2257         NewLoad =
2258             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2259                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2262         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2263                                             Group->getAlign(), "wide.vec");
2264       Group->addMetadata(NewLoad);
2265       NewLoads.push_back(NewLoad);
2266     }
2267 
2268     // For each member in the group, shuffle out the appropriate data from the
2269     // wide loads.
2270     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2271       Instruction *Member = Group->getMember(I);
2272 
2273       // Skip the gaps in the group.
2274       if (!Member)
2275         continue;
2276 
2277       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2278       for (unsigned Part = 0; Part < UF; Part++) {
2279         Value *StridedVec = Builder.CreateShuffleVector(
2280             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2281 
        // If this member has a different type, cast the result to that type.
2283         if (Member->getType() != ScalarTy) {
2284           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2285           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2286         }
2287 
2288         if (Group->isReverse())
2289           StridedVec = reverseVector(StridedVec);
2290 
2291         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2292       }
2293     }
2294     return;
2295   }
2296 
  // The subvector type for the current instruction.
2298   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2299 
2300   // Vectorize the interleaved store group.
2301   for (unsigned Part = 0; Part < UF; Part++) {
2302     // Collect the stored vector from each member.
2303     SmallVector<Value *, 4> StoredVecs;
2304     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2306       Instruction *Member = Group->getMember(i);
2307       assert(Member && "Fail to get a member from an interleaved store group");
2308 
2309       Value *StoredVec = getOrCreateVectorValue(
2310           cast<StoreInst>(Member)->getValueOperand(), Part);
2311       if (Group->isReverse())
2312         StoredVec = reverseVector(StoredVec);
2313 
      // If this member has a different type, cast it to the unified type.
      if (StoredVec->getType() != SubVT)
2317         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2318 
2319       StoredVecs.push_back(StoredVec);
2320     }
2321 
2322     // Concatenate all vectors into a wide vector.
2323     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2324 
2325     // Interleave the elements in the wide vector.
2326     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2327     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2328                                               "interleaved.vec");
2329 
2330     Instruction *NewStoreInstr;
2331     if (BlockInMask) {
2332       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2333       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
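      // As in the load path, replicate the block mask so that it covers every
      // member of the interleave group.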
2334       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2335       Value *ShuffledMask = Builder.CreateShuffleVector(
2336           BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2337       NewStoreInstr = Builder.CreateMaskedStore(
2338           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2341       NewStoreInstr =
2342           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2343 
2344     Group->addMetadata(NewStoreInstr);
2345   }
2346 }
2347 
2348 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2349                                                      VPTransformState &State,
2350                                                      VPValue *Addr,
2351                                                      VPValue *StoredValue,
2352                                                      VPValue *BlockInMask) {
2353   // Attempt to issue a wide load.
2354   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2355   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2356 
2357   assert((LI || SI) && "Invalid Load/Store instruction");
2358   assert((!SI || StoredValue) && "No stored value provided for widened store");
2359   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2360 
2361   LoopVectorizationCostModel::InstWidening Decision =
2362       Cost->getWideningDecision(Instr, VF);
2363   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2364          "CM decision should be taken at this point");
2365   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2366     return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2367 
2368   Type *ScalarDataTy = getMemInstValueType(Instr);
2369   Type *DataTy = VectorType::get(ScalarDataTy, VF);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2372   const DataLayout &DL = Instr->getModule()->getDataLayout();
2373   const Align Alignment =
2374       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2375 
2376   // Determine if the pointer operand of the access is either consecutive or
2377   // reverse consecutive.
2378   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2379   bool ConsecutiveStride =
2380       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2381   bool CreateGatherScatter =
2382       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2383 
2384   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2385   // gather/scatter. Otherwise Decision should have been to Scalarize.
2386   assert((ConsecutiveStride || CreateGatherScatter) &&
2387          "The instruction should be scalarized");
2388   (void)ConsecutiveStride;
2389 
2390   VectorParts BlockInMaskParts(UF);
2391   bool isMaskRequired = BlockInMask;
2392   if (isMaskRequired)
2393     for (unsigned Part = 0; Part < UF; ++Part)
2394       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2395 
2396   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2397     // Calculate the pointer for the specific unroll-part.
2398     GetElementPtrInst *PartPtr = nullptr;
2399 
2400     bool InBounds = false;
2401     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2402       InBounds = gep->isInBounds();
2403 
2404     if (Reverse) {
      // If the address is consecutive but reversed, then the wide load/store
      // needs to start at the last vector element.
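      // For example, with VF = 4 the pointer for Part 0 is adjusted to
      // Ptr - 3 so that the wide access covers Ptr - 3 .. Ptr, and the pointer
      // for Part 1 to Ptr - 7, covering Ptr - 7 .. Ptr - 4; the loaded or
      // stored vector (and any mask) is then reversed to match the scalar
      // access order.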
2407       PartPtr = cast<GetElementPtrInst>(
2408           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2409       PartPtr->setIsInBounds(InBounds);
2410       PartPtr = cast<GetElementPtrInst>(
2411           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2412       PartPtr->setIsInBounds(InBounds);
2413       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2414         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2415     } else {
2416       PartPtr = cast<GetElementPtrInst>(
2417           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2418       PartPtr->setIsInBounds(InBounds);
2419     }
2420 
2421     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2422     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2423   };
2424 
2425   // Handle Stores:
2426   if (SI) {
2427     setDebugLocFromInst(Builder, SI);
2428 
2429     for (unsigned Part = 0; Part < UF; ++Part) {
2430       Instruction *NewSI = nullptr;
2431       Value *StoredVal = State.get(StoredValue, Part);
2432       if (CreateGatherScatter) {
2433         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2434         Value *VectorGep = State.get(Addr, Part);
2435         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2436                                             MaskPart);
2437       } else {
2438         if (Reverse) {
2439           // If we store to reverse consecutive memory locations, then we need
2440           // to reverse the order of elements in the stored value.
2441           StoredVal = reverseVector(StoredVal);
2442           // We don't want to update the value in the map as it might be used in
2443           // another expression. So don't call resetVectorValue(StoredVal).
2444         }
2445         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2446         if (isMaskRequired)
2447           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2448                                             BlockInMaskParts[Part]);
2449         else
2450           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2451       }
2452       addMetadata(NewSI, SI);
2453     }
2454     return;
2455   }
2456 
2457   // Handle loads.
2458   assert(LI && "Must have a load instruction");
2459   setDebugLocFromInst(Builder, LI);
2460   for (unsigned Part = 0; Part < UF; ++Part) {
2461     Value *NewLI;
2462     if (CreateGatherScatter) {
2463       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2464       Value *VectorGep = State.get(Addr, Part);
2465       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2466                                          nullptr, "wide.masked.gather");
2467       addMetadata(NewLI, LI);
2468     } else {
2469       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2470       if (isMaskRequired)
2471         NewLI = Builder.CreateMaskedLoad(
2472             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2473             "wide.masked.load");
2474       else
2475         NewLI =
2476             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2477 
2478       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2479       addMetadata(NewLI, LI);
2480       if (Reverse)
2481         NewLI = reverseVector(NewLI);
2482     }
2483     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2484   }
2485 }
2486 
2487 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2488                                                const VPIteration &Instance,
2489                                                bool IfPredicateInstr) {
2490   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2491 
2492   setDebugLocFromInst(Builder, Instr);
2493 
  // Does this instruction return a value?
2495   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2496 
2497   Instruction *Cloned = Instr->clone();
2498   if (!IsVoidRetTy)
2499     Cloned->setName(Instr->getName() + ".cloned");
2500 
  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
2503   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2504     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2505     Cloned->setOperand(op, NewOp);
2506   }
2507   addNewMetadata(Cloned, Instr);
2508 
2509   // Place the cloned scalar in the new loop.
2510   Builder.Insert(Cloned);
2511 
2512   // Add the cloned scalar to the scalar map entry.
2513   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2514 
  // If we just cloned a new assumption, add it to the assumption cache.
2516   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2517     if (II->getIntrinsicID() == Intrinsic::assume)
2518       AC->registerAssumption(II);
2519 
2520   // End if-block.
2521   if (IfPredicateInstr)
2522     PredicatedInstructions.push_back(Cloned);
2523 }
2524 
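// Create an induction variable for loop L: an integer phi named "index" that
// starts at Start, is incremented by Step in the latch, and branches to the
// loop's exit block once the incremented value equals End.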
2525 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2526                                                       Value *End, Value *Step,
2527                                                       Instruction *DL) {
2528   BasicBlock *Header = L->getHeader();
2529   BasicBlock *Latch = L->getLoopLatch();
2530   // As we're just creating this loop, it's possible no latch exists
2531   // yet. If so, use the header as this will be a single block loop.
2532   if (!Latch)
2533     Latch = Header;
2534 
2535   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2536   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2537   setDebugLocFromInst(Builder, OldInst);
2538   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2539 
2540   Builder.SetInsertPoint(Latch->getTerminator());
2541   setDebugLocFromInst(Builder, OldInst);
2542 
2543   // Create i+1 and fill the PHINode.
2544   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2545   Induction->addIncoming(Start, L->getLoopPreheader());
2546   Induction->addIncoming(Next, Latch);
2547   // Create the compare.
2548   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2549   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2550 
2551   // Now we have two terminators. Remove the old one from the block.
2552   Latch->getTerminator()->eraseFromParent();
2553 
2554   return Induction;
2555 }
2556 
2557 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2558   if (TripCount)
2559     return TripCount;
2560 
2561   assert(L && "Create Trip Count for null loop.");
2562   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2563   // Find the loop boundaries.
2564   ScalarEvolution *SE = PSE.getSE();
2565   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2566   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2567          "Invalid loop count");
2568 
2569   Type *IdxTy = Legal->getWidestInductionType();
2570   assert(IdxTy && "No type for induction");
2571 
  // The exit count might have the type i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count in that case is if
  // the induction variable is signed and does not overflow, so the truncation
  // is legal.
2577   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2578       IdxTy->getPrimitiveSizeInBits())
2579     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2580   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2581 
2582   // Get the total trip count from the count by adding 1.
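  // For example, a loop 'for (i = 0; i < n; ++i)' that executes at least once
  // has a backedge-taken count of n - 1 and a trip count of n.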
2583   const SCEV *ExitCount = SE->getAddExpr(
2584       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2585 
2586   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2587 
2588   // Expand the trip count and place the new instructions in the preheader.
2589   // Notice that the pre-header does not change, only the loop body.
2590   SCEVExpander Exp(*SE, DL, "induction");
2591 
2592   // Count holds the overall loop count (N).
2593   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2594                                 L->getLoopPreheader()->getTerminator());
2595 
2596   if (TripCount->getType()->isPointerTy())
2597     TripCount =
2598         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2599                                     L->getLoopPreheader()->getTerminator());
2600 
2601   return TripCount;
2602 }
2603 
2604 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2605   if (VectorTripCount)
2606     return VectorTripCount;
2607 
2608   Value *TC = getOrCreateTripCount(L);
2609   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2610 
2611   Type *Ty = TC->getType();
2612   Constant *Step = ConstantInt::get(Ty, VF * UF);
2613 
2614   // If the tail is to be folded by masking, round the number of iterations N
2615   // up to a multiple of Step instead of rounding down. This is done by first
2616   // adding Step-1 and then rounding down. Note that it's ok if this addition
2617   // overflows: the vector induction variable will eventually wrap to zero given
2618   // that it starts at zero and its Step is a power of two; the loop will then
2619   // exit, with the last early-exit vector comparison also producing all-true.
2620   if (Cost->foldTailByMasking()) {
2621     assert(isPowerOf2_32(VF * UF) &&
2622            "VF*UF must be a power of 2 when folding tail by masking");
2623     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2624   }
2625 
2626   // Now we need to generate the expression for the part of the loop that the
2627   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2628   // iterations are not required for correctness, or N - Step, otherwise. Step
2629   // is equal to the vectorization factor (number of SIMD elements) times the
2630   // unroll factor (number of SIMD instructions).
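  // For example, with VF = 4, UF = 1 and a trip count N = 10, Step is 4 and
  // the vector trip count is 10 - (10 % 4) = 8, leaving 2 scalar iterations.
  // With tail folding, N is first bumped to 13, giving a vector trip count of
  // 12 (the next multiple of 4), and the masked vector loop covers all 10
  // iterations.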
2631   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2632 
2633   // If there is a non-reversed interleaved group that may speculatively access
2634   // memory out-of-bounds, we need to ensure that there will be at least one
2635   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2636   // the trip count, we set the remainder to be equal to the step. If the step
2637   // does not evenly divide the trip count, no adjustment is necessary since
2638   // there will already be scalar iterations. Note that the minimum iterations
2639   // check ensures that N >= Step.
2640   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2641     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2642     R = Builder.CreateSelect(IsZero, Step, R);
2643   }
2644 
2645   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2646 
2647   return VectorTripCount;
2648 }
2649 
2650 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2651                                                    const DataLayout &DL) {
2652   // Verify that V is a vector type with same number of elements as DstVTy.
2653   unsigned VF = DstVTy->getNumElements();
2654   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2656   Type *SrcElemTy = SrcVecTy->getElementType();
2657   Type *DstElemTy = DstVTy->getElementType();
2658   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2659          "Vector elements must have same size");
2660 
2661   // Do a direct cast if element types are castable.
2662   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2663     return Builder.CreateBitOrPointerCast(V, DstVTy);
2664   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
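  // For example, on a target with 64-bit pointers, a <2 x double> value is
  // first bitcast to <2 x i64> and then cast to <2 x i8*> with inttoptr (or
  // the other way around for the pointer-to-FP direction).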
2669   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2670          "Only one type should be a pointer type");
2671   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2672          "Only one type should be a floating point type");
2673   Type *IntTy =
2674       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2675   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2676   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2677   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2678 }
2679 
2680 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2681                                                          BasicBlock *Bypass) {
2682   Value *Count = getOrCreateTripCount(L);
2683   // Reuse existing vector loop preheader for TC checks.
2684   // Note that new preheader block is generated for vector loop.
2685   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2686   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2687 
2688   // Generate code to check if the loop's trip count is less than VF * UF, or
2689   // equal to it in case a scalar epilogue is required; this implies that the
2690   // vector trip count is zero. This check also covers the case where adding one
2691   // to the backedge-taken count overflowed leading to an incorrect trip count
2692   // of zero. In this case we will also jump to the scalar loop.
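  // For example, with VF = 4 and UF = 2 we branch to the scalar loop when the
  // trip count is less than 8, or less than or equal to 8 if a scalar
  // epilogue is required.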
2693   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2694                                           : ICmpInst::ICMP_ULT;
2695 
2696   // If tail is to be folded, vector loop takes care of all iterations.
2697   Value *CheckMinIters = Builder.getFalse();
2698   if (!Cost->foldTailByMasking())
2699     CheckMinIters = Builder.CreateICmp(
2700         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2701         "min.iters.check");
2702 
2703   // Create new preheader for vector loop.
2704   LoopVectorPreHeader =
2705       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2706                  "vector.ph");
2707 
2708   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2709                                DT->getNode(Bypass)->getIDom()) &&
2710          "TC check is expected to dominate Bypass");
2711 
2712   // Update dominator for Bypass & LoopExit.
2713   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2714   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2715 
2716   ReplaceInstWithInst(
2717       TCCheckBlock->getTerminator(),
2718       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2719   LoopBypassBlocks.push_back(TCCheckBlock);
2720 }
2721 
2722 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2723   // Reuse existing vector loop preheader for SCEV checks.
2724   // Note that new preheader block is generated for vector loop.
2725   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2726 
  // Generate the code to check the SCEV assumptions that we made.
2728   // We want the new basic block to start at the first instruction in a
2729   // sequence of instructions that form a check.
2730   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2731                    "scev.check");
2732   Value *SCEVCheck = Exp.expandCodeForPredicate(
2733       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2734 
2735   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2736     if (C->isZero())
2737       return;
2738 
2739   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2740          "Cannot SCEV check stride or overflow when optimizing for size");
2741 
2742   SCEVCheckBlock->setName("vector.scevcheck");
2743   // Create new preheader for vector loop.
2744   LoopVectorPreHeader =
2745       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2746                  nullptr, "vector.ph");
2747 
  // Update dominator only if this is the first RT check.
2749   if (LoopBypassBlocks.empty()) {
2750     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2751     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2752   }
2753 
2754   ReplaceInstWithInst(
2755       SCEVCheckBlock->getTerminator(),
2756       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2757   LoopBypassBlocks.push_back(SCEVCheckBlock);
2758   AddedSafetyChecks = true;
2759 }
2760 
2761 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2762   // VPlan-native path does not do any analysis for runtime checks currently.
2763   if (EnableVPlanNativePath)
2764     return;
2765 
2766   // Reuse existing vector loop preheader for runtime memory checks.
2767   // Note that new preheader block is generated for vector loop.
2768   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2769 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2773   Instruction *FirstCheckInst;
2774   Instruction *MemRuntimeCheck;
2775   std::tie(FirstCheckInst, MemRuntimeCheck) =
2776       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2777   if (!MemRuntimeCheck)
2778     return;
2779 
2780   if (MemCheckBlock->getParent()->hasOptSize()) {
2781     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2782            "Cannot emit memory checks when optimizing for size, unless forced "
2783            "to vectorize.");
2784     ORE->emit([&]() {
2785       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2786                                         L->getStartLoc(), L->getHeader())
2787              << "Code-size may be reduced by not forcing "
2788                 "vectorization, or by source-code modifications "
2789                 "eliminating the need for runtime checks "
2790                 "(e.g., adding 'restrict').";
2791     });
2792   }
2793 
2794   MemCheckBlock->setName("vector.memcheck");
2795   // Create new preheader for vector loop.
2796   LoopVectorPreHeader =
2797       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2798                  "vector.ph");
2799 
  // Update dominator only if this is the first RT check.
2801   if (LoopBypassBlocks.empty()) {
2802     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2803     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2804   }
2805 
2806   ReplaceInstWithInst(
2807       MemCheckBlock->getTerminator(),
2808       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2809   LoopBypassBlocks.push_back(MemCheckBlock);
2810   AddedSafetyChecks = true;
2811 
2812   // We currently don't use LoopVersioning for the actual loop cloning but we
2813   // still use it to add the noalias metadata.
2814   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2815                                           PSE.getSE());
2816   LVer->prepareNoAliasMetadata();
2817 }
2818 
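// Compute the value of the transformed induction at the given Index:
// StartValue + Index * Step for integer inductions, a GEP off StartValue for
// pointer inductions, and the original fadd/fsub of StartValue and
// Index * Step for floating-point inductions.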
2819 Value *InnerLoopVectorizer::emitTransformedIndex(
2820     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2821     const InductionDescriptor &ID) const {
2822 
2823   SCEVExpander Exp(*SE, DL, "induction");
2824   auto Step = ID.getStep();
2825   auto StartValue = ID.getStartValue();
2826   assert(Index->getType() == Step->getType() &&
2827          "Index type does not match StepValue type");
2828 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
2835   auto CreateAdd = [&B](Value *X, Value *Y) {
2836     assert(X->getType() == Y->getType() && "Types don't match!");
2837     if (auto *CX = dyn_cast<ConstantInt>(X))
2838       if (CX->isZero())
2839         return Y;
2840     if (auto *CY = dyn_cast<ConstantInt>(Y))
2841       if (CY->isZero())
2842         return X;
2843     return B.CreateAdd(X, Y);
2844   };
2845 
2846   auto CreateMul = [&B](Value *X, Value *Y) {
2847     assert(X->getType() == Y->getType() && "Types don't match!");
2848     if (auto *CX = dyn_cast<ConstantInt>(X))
2849       if (CX->isOne())
2850         return Y;
2851     if (auto *CY = dyn_cast<ConstantInt>(Y))
2852       if (CY->isOne())
2853         return X;
2854     return B.CreateMul(X, Y);
2855   };
2856 
2857   switch (ID.getKind()) {
2858   case InductionDescriptor::IK_IntInduction: {
2859     assert(Index->getType() == StartValue->getType() &&
2860            "Index type does not match StartValue type");
2861     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2862       return B.CreateSub(StartValue, Index);
2863     auto *Offset = CreateMul(
2864         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2865     return CreateAdd(StartValue, Offset);
2866   }
2867   case InductionDescriptor::IK_PtrInduction: {
2868     assert(isa<SCEVConstant>(Step) &&
2869            "Expected constant step for pointer induction");
2870     return B.CreateGEP(
2871         StartValue->getType()->getPointerElementType(), StartValue,
2872         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2873                                            &*B.GetInsertPoint())));
2874   }
2875   case InductionDescriptor::IK_FpInduction: {
2876     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2877     auto InductionBinOp = ID.getInductionBinOp();
2878     assert(InductionBinOp &&
2879            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2880             InductionBinOp->getOpcode() == Instruction::FSub) &&
2881            "Original bin op should be defined for FP induction");
2882 
2883     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2884 
2885     // Floating point operations had to be 'fast' to enable the induction.
2886     FastMathFlags Flags;
2887     Flags.setFast();
2888 
2889     Value *MulExp = B.CreateFMul(StepValue, Index);
2890     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2892       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2893 
2894     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2895                                "induction");
2896     if (isa<Instruction>(BOp))
2897       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2898 
2899     return BOp;
2900   }
2901   case InductionDescriptor::IK_NoInduction:
2902     return nullptr;
2903   }
2904   llvm_unreachable("invalid enum");
2905 }
2906 
2907 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2908   /*
2909    In this function we generate a new loop. The new loop will contain
2910    the vectorized instructions while the old loop will continue to run the
2911    scalar remainder.
2912 
2913        [ ] <-- loop iteration number check.
2914     /   |
2915    /    v
2916   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2917   |  /  |
2918   | /   v
2919   ||   [ ]     <-- vector pre header.
2920   |/    |
2921   |     v
2922   |    [  ] \
2923   |    [  ]_|   <-- vector loop.
2924   |     |
2925   |     v
2926   |   -[ ]   <--- middle-block.
2927   |  /  |
2928   | /   v
2929   -|- >[ ]     <--- new preheader.
2930    |    |
2931    |    v
2932    |   [ ] \
2933    |   [ ]_|   <-- old scalar loop to handle remainder.
2934     \   |
2935      \  v
2936       >[ ]     <-- exit block.
2937    ...
2938    */
2939 
2940   MDNode *OrigLoopID = OrigLoop->getLoopID();
2941 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
2953   OldInduction = Legal->getPrimaryInduction();
2954   Type *IdxTy = Legal->getWidestInductionType();
2955 
2956   // Split the single block loop into the two loop structure described above.
2957   LoopScalarBody = OrigLoop->getHeader();
2958   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2959   LoopExitBlock = OrigLoop->getExitBlock();
2960   assert(LoopExitBlock && "Must have an exit block");
2961   assert(LoopVectorPreHeader && "Invalid loop structure");
2962 
2963   LoopMiddleBlock =
2964       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2965                  LI, nullptr, "middle.block");
2966   LoopScalarPreHeader =
2967       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2968                  nullptr, "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
2972   LoopVectorBody =
2973       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2974                  nullptr, nullptr, "vector.body");
2975 
2976   // Update dominator for loop exit.
2977   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2978 
2979   // Create and register the new vector loop.
2980   Loop *Lp = LI->AllocateLoop();
2981   Loop *ParentLoop = OrigLoop->getParentLoop();
2982 
2983   // Insert the new loop into the loop nest and register the new basic blocks
2984   // before calling any utilities such as SCEV that require valid LoopInfo.
2985   if (ParentLoop) {
2986     ParentLoop->addChildLoop(Lp);
2987   } else {
2988     LI->addTopLevelLoop(Lp);
2989   }
2990   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
2991 
2992   // Find the loop boundaries.
2993   Value *Count = getOrCreateTripCount(Lp);
2994 
2995   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2996 
2997   // Now, compare the new count to zero. If it is zero skip the vector loop and
2998   // jump to the scalar loop. This check also covers the case where the
2999   // backedge-taken count is uint##_max: adding one to it will overflow leading
3000   // to an incorrect trip count of zero. In this (rare) case we will also jump
3001   // to the scalar loop.
3002   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3003 
3004   // Generate the code to check any assumptions that we've made for SCEV
3005   // expressions.
3006   emitSCEVChecks(Lp, LoopScalarPreHeader);
3007 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3011   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3012 
3013   // Generate the induction variable.
3014   // The loop step is equal to the vectorization factor (num of SIMD elements)
3015   // times the unroll factor (num of SIMD instructions).
3016   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3017   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3018   Induction =
3019       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3020                               getDebugLocFromInstOrOperands(OldInduction));
3021 
3022   // We are going to resume the execution of the scalar loop.
3023   // Go over all of the induction variables that we found and fix the
3024   // PHIs that are left in the scalar version of the loop.
3025   // The starting values of PHI nodes depend on the counter of the last
3026   // iteration in the vectorized loop.
3027   // If we come from a bypass edge then we need to start from the original
3028   // start value.
3029 
  // Create the resume values that provide the new starting index for the
  // scalar loop: when entering from the middle block, each induction resumes
  // from its value at the end of the vector loop; when entering from a bypass
  // edge, it resumes from its original start value.
3033   for (auto &InductionEntry : Legal->getInductionVars()) {
3034     PHINode *OrigPhi = InductionEntry.first;
3035     InductionDescriptor II = InductionEntry.second;
3036 
    // Create phi nodes to merge from the backedge-taken check block.
3038     PHINode *BCResumeVal =
3039         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3040                         LoopScalarPreHeader->getTerminator());
3041     // Copy original phi DL over to the new one.
3042     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3043     Value *&EndValue = IVEndValues[OrigPhi];
3044     if (OrigPhi == OldInduction) {
3045       // We know what the end value is.
3046       EndValue = CountRoundDown;
3047     } else {
3048       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3049       Type *StepType = II.getStep()->getType();
3050       Instruction::CastOps CastOp =
3051           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3052       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3053       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3054       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3055       EndValue->setName("ind.end");
3056     }
3057 
3058     // The new PHI merges the original incoming value, in case of a bypass,
3059     // or the value at the end of the vectorized loop.
3060     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3061 
3062     // Fix the scalar body counter (PHI node).
3063     // The old induction's phi node in the scalar body needs the truncated
3064     // value.
3065     for (BasicBlock *BB : LoopBypassBlocks)
3066       BCResumeVal->addIncoming(II.getStartValue(), BB);
3067     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3068   }
3069 
3070   // We need the OrigLoop (scalar loop part) latch terminator to help
3071   // produce correct debug info for the middle block BB instructions.
3072   // The legality check stage guarantees that the loop will have a single
3073   // latch.
3074   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3075          "Scalar loop latch terminator isn't a branch");
3076   BranchInst *ScalarLatchBr =
3077       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3078 
3079   // Add a check in the middle block to see if we have completed
3080   // all of the iterations in the first vector loop.
3081   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3082   // If tail is to be folded, we know we don't need to run the remainder.
3083   Value *CmpN = Builder.getTrue();
3084   if (!Cost->foldTailByMasking()) {
3085     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3086                            CountRoundDown, "cmp.n",
3087                            LoopMiddleBlock->getTerminator());
3088 
3089     // Here we use the same DebugLoc as the scalar loop latch branch instead
3090     // of the corresponding compare because they may have ended up with
3091     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g., if the compare has a line number inside the loop.
3093     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3094   }
3095 
3096   BranchInst *BrInst =
3097       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3098   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3099   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3100 
3101   // Get ready to start creating new instructions into the vectorized body.
3102   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3103          "Inconsistent vector loop preheader");
3104   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3105 
3106   Optional<MDNode *> VectorizedLoopID =
3107       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3108                                       LLVMLoopVectorizeFollowupVectorized});
3109   if (VectorizedLoopID.hasValue()) {
3110     Lp->setLoopID(VectorizedLoopID.getValue());
3111 
3112     // Do not setAlreadyVectorized if loop attributes have been defined
3113     // explicitly.
3114     return LoopVectorPreHeader;
3115   }
3116 
3117   // Keep all loop hints from the original loop on the vector loop (we'll
3118   // replace the vectorizer-specific hints below).
3119   if (MDNode *LID = OrigLoop->getLoopID())
3120     Lp->setLoopID(LID);
3121 
3122   LoopVectorizeHints Hints(Lp, true, *ORE);
3123   Hints.setAlreadyVectorized();
3124 
3125 #ifdef EXPENSIVE_CHECKS
3126   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3127   LI->verify(*DT);
3128 #endif
3129 
3130   return LoopVectorPreHeader;
3131 }
3132 
3133 // Fix up external users of the induction variable. At this point, we are
3134 // in LCSSA form, with all external PHIs that use the IV having one input value,
3135 // coming from the remainder loop. We need those PHIs to also have a correct
3136 // value for the IV when arriving directly from the middle block.
3137 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3138                                        const InductionDescriptor &II,
3139                                        Value *CountRoundDown, Value *EndValue,
3140                                        BasicBlock *MiddleBlock) {
3141   // There are two kinds of external IV usages - those that use the value
3142   // computed in the last iteration (the PHI) and those that use the penultimate
3143   // value (the value that feeds into the phi from the loop latch).
3144   // We allow both, but they, obviously, have different values.
3145 
3146   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3147 
3148   DenseMap<Value *, Value *> MissingVals;
3149 
3150   // An external user of the last iteration's value should see the value that
3151   // the remainder loop uses to initialize its own IV.
3152   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3153   for (User *U : PostInc->users()) {
3154     Instruction *UI = cast<Instruction>(U);
3155     if (!OrigLoop->contains(UI)) {
3156       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3157       MissingVals[UI] = EndValue;
3158     }
3159   }
3160 
  // An external user of the penultimate value needs to see EndValue - Step.
3162   // The simplest way to get this is to recompute it from the constituent SCEVs,
3163   // that is Start + (Step * (CRD - 1)).
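  // For example, for a canonical IV starting at 0 with step 1 and a
  // CountRoundDown of 8, the escaping penultimate value is
  // 0 + 1 * (8 - 1) = 7.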
3164   for (User *U : OrigPhi->users()) {
3165     auto *UI = cast<Instruction>(U);
3166     if (!OrigLoop->contains(UI)) {
3167       const DataLayout &DL =
3168           OrigLoop->getHeader()->getModule()->getDataLayout();
3169       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3170 
3171       IRBuilder<> B(MiddleBlock->getTerminator());
3172       Value *CountMinusOne = B.CreateSub(
3173           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3174       Value *CMO =
3175           !II.getStep()->getType()->isIntegerTy()
3176               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3177                              II.getStep()->getType())
3178               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3179       CMO->setName("cast.cmo");
3180       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3181       Escape->setName("ind.escape");
3182       MissingVals[UI] = Escape;
3183     }
3184   }
3185 
3186   for (auto &I : MissingVals) {
3187     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
3190     // In this case, if IV1 has an external use, we need to avoid adding both
3191     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3192     // don't already have an incoming value for the middle block.
3193     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3194       PHI->addIncoming(I.second, MiddleBlock);
3195   }
3196 }
3197 
3198 namespace {
3199 
3200 struct CSEDenseMapInfo {
3201   static bool canHandle(const Instruction *I) {
3202     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3203            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3204   }
3205 
3206   static inline Instruction *getEmptyKey() {
3207     return DenseMapInfo<Instruction *>::getEmptyKey();
3208   }
3209 
3210   static inline Instruction *getTombstoneKey() {
3211     return DenseMapInfo<Instruction *>::getTombstoneKey();
3212   }
3213 
3214   static unsigned getHashValue(const Instruction *I) {
3215     assert(canHandle(I) && "Unknown instruction!");
3216     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3217                                                            I->value_op_end()));
3218   }
3219 
3220   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3221     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3222         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3223       return LHS == RHS;
3224     return LHS->isIdenticalTo(RHS);
3225   }
3226 };
3227 
3228 } // end anonymous namespace
3229 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3233   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3234   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3235     Instruction *In = &*I++;
3236 
3237     if (!CSEDenseMapInfo::canHandle(In))
3238       continue;
3239 
3240     // Check if we can replace this instruction with any of the
3241     // visited instructions.
3242     if (Instruction *V = CSEMap.lookup(In)) {
3243       In->replaceAllUsesWith(V);
3244       In->eraseFromParent();
3245       continue;
3246     }
3247 
3248     CSEMap[In] = In;
3249   }
3250 }
3251 
3252 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3253                                                        unsigned VF,
3254                                                        bool &NeedToScalarize) {
3255   Function *F = CI->getCalledFunction();
3256   Type *ScalarRetTy = CI->getType();
3257   SmallVector<Type *, 4> Tys, ScalarTys;
3258   for (auto &ArgOp : CI->arg_operands())
3259     ScalarTys.push_back(ArgOp->getType());
3260 
3261   // Estimate cost of scalarized vector call. The source operands are assumed
3262   // to be vectors, so we need to extract individual elements from there,
3263   // execute VF scalar calls, and then gather the result into the vector return
3264   // value.
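  // The resulting estimate is ScalarCallCost * VF + ScalarizationCost; e.g.,
  // with VF = 4, a scalar call cost of 10 and a scalarization overhead of 6
  // the scalarized estimate is 4 * 10 + 6 = 46.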
3265   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3266   if (VF == 1)
3267     return ScalarCallCost;
3268 
3269   // Compute corresponding vector type for return value and arguments.
3270   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3271   for (Type *ScalarTy : ScalarTys)
3272     Tys.push_back(ToVectorTy(ScalarTy, VF));
3273 
3274   // Compute costs of unpacking argument values for the scalar calls and
3275   // packing the return values to a vector.
3276   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3277 
3278   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3279 
3280   // If we can't emit a vector call for this function, then the currently found
3281   // cost is the cost we need to return.
3282   NeedToScalarize = true;
3283   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3284   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3285 
3286   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3287     return Cost;
3288 
3289   // If the corresponding vector cost is cheaper, return its cost.
3290   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3291   if (VectorCallCost < Cost) {
3292     NeedToScalarize = false;
3293     return VectorCallCost;
3294   }
3295   return Cost;
3296 }
3297 
3298 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3299                                                             unsigned VF) {
3300   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3301   assert(ID && "Expected intrinsic call!");
3302 
3303   FastMathFlags FMF;
3304   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3305     FMF = FPMO->getFastMathFlags();
3306 
3307   SmallVector<Value *, 4> Operands(CI->arg_operands());
3308   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI);
3309 }
3310 
3311 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3312   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3313   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3314   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3315 }
3316 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3317   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3318   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3319   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3320 }
3321 
3322 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3323   // For every instruction `I` in MinBWs, truncate the operands, create a
3324   // truncated version of `I` and reextend its result. InstCombine runs
3325   // later and will remove any ext/trunc pairs.
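  // For example, if MinBWs records a width of 8 bits for a <4 x i32> add, the
  // add is re-created on <4 x i8> operands (truncating or stripping zexts as
  // needed) and its result is zero-extended back to <4 x i32>.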
3326   SmallPtrSet<Value *, 4> Erased;
3327   for (const auto &KV : Cost->getMinimalBitwidths()) {
3328     // If the value wasn't vectorized, we must maintain the original scalar
3329     // type. The absence of the value from VectorLoopValueMap indicates that it
3330     // wasn't vectorized.
3331     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3332       continue;
3333     for (unsigned Part = 0; Part < UF; ++Part) {
3334       Value *I = getOrCreateVectorValue(KV.first, Part);
3335       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3336           !isa<Instruction>(I))
3337         continue;
3338       Type *OriginalTy = I->getType();
3339       Type *ScalarTruncatedTy =
3340           IntegerType::get(OriginalTy->getContext(), KV.second);
3341       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3342                                           OriginalTy->getVectorNumElements());
3343       if (TruncatedTy == OriginalTy)
3344         continue;
3345 
3346       IRBuilder<> B(cast<Instruction>(I));
3347       auto ShrinkOperand = [&](Value *V) -> Value * {
3348         if (auto *ZI = dyn_cast<ZExtInst>(V))
3349           if (ZI->getSrcTy() == TruncatedTy)
3350             return ZI->getOperand(0);
3351         return B.CreateZExtOrTrunc(V, TruncatedTy);
3352       };
3353 
3354       // The actual instruction modification depends on the instruction type,
3355       // unfortunately.
3356       Value *NewI = nullptr;
3357       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3358         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3359                              ShrinkOperand(BO->getOperand(1)));
3360 
3361         // Any wrapping introduced by shrinking this operation shouldn't be
3362         // considered undefined behavior. So, we can't unconditionally copy
3363         // arithmetic wrapping flags to NewI.
3364         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3365       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3366         NewI =
3367             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3368                          ShrinkOperand(CI->getOperand(1)));
3369       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3370         NewI = B.CreateSelect(SI->getCondition(),
3371                               ShrinkOperand(SI->getTrueValue()),
3372                               ShrinkOperand(SI->getFalseValue()));
3373       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3374         switch (CI->getOpcode()) {
3375         default:
3376           llvm_unreachable("Unhandled cast!");
3377         case Instruction::Trunc:
3378           NewI = ShrinkOperand(CI->getOperand(0));
3379           break;
3380         case Instruction::SExt:
3381           NewI = B.CreateSExtOrTrunc(
3382               CI->getOperand(0),
3383               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3384           break;
3385         case Instruction::ZExt:
3386           NewI = B.CreateZExtOrTrunc(
3387               CI->getOperand(0),
3388               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3389           break;
3390         }
3391       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3392         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3393         auto *O0 = B.CreateZExtOrTrunc(
3394             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3395         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3396         auto *O1 = B.CreateZExtOrTrunc(
3397             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3398 
3399         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3400       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3401         // Don't do anything with the operands, just extend the result.
3402         continue;
3403       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3404         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3405         auto *O0 = B.CreateZExtOrTrunc(
3406             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3407         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3408         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3409       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3410         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3411         auto *O0 = B.CreateZExtOrTrunc(
3412             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3413         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3414       } else {
3415         // If we don't know what to do, be conservative and don't do anything.
3416         continue;
3417       }
3418 
3419       // Lastly, extend the result.
3420       NewI->takeName(cast<Instruction>(I));
3421       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3422       I->replaceAllUsesWith(Res);
3423       cast<Instruction>(I)->eraseFromParent();
3424       Erased.insert(I);
3425       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3426     }
3427   }
3428 
3429   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3430   for (const auto &KV : Cost->getMinimalBitwidths()) {
3431     // If the value wasn't vectorized, we must maintain the original scalar
3432     // type. The absence of the value from VectorLoopValueMap indicates that it
3433     // wasn't vectorized.
3434     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3435       continue;
3436     for (unsigned Part = 0; Part < UF; ++Part) {
3437       Value *I = getOrCreateVectorValue(KV.first, Part);
3438       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3439       if (Inst && Inst->use_empty()) {
3440         Value *NewI = Inst->getOperand(0);
3441         Inst->eraseFromParent();
3442         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3443       }
3444     }
3445   }
3446 }
3447 
3448 void InnerLoopVectorizer::fixVectorizedLoop() {
3449   // Insert truncates and extends for any truncated instructions as hints to
3450   // InstCombine.
3451   if (VF > 1)
3452     truncateToMinimalBitwidths();
3453 
3454   // Fix widened non-induction PHIs by setting up the PHI operands.
3455   if (OrigPHIsToFix.size()) {
3456     assert(EnableVPlanNativePath &&
3457            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3458     fixNonInductionPHIs();
3459   }
3460 
3461   // At this point every instruction in the original loop is widened to a
3462   // vector form. Now we need to fix the recurrences in the loop. These PHI
3463   // nodes are currently empty because we did not want to introduce cycles.
3464   // This is the second stage of vectorizing recurrences.
3465   fixCrossIterationPHIs();
3466 
3467   // Forget the original basic block.
3468   PSE.getSE()->forgetLoop(OrigLoop);
3469 
3470   // Fix-up external users of the induction variables.
3471   for (auto &Entry : Legal->getInductionVars())
3472     fixupIVUsers(Entry.first, Entry.second,
3473                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3474                  IVEndValues[Entry.first], LoopMiddleBlock);
3475 
3476   fixLCSSAPHIs();
3477   for (Instruction *PI : PredicatedInstructions)
3478     sinkScalarOperands(&*PI);
3479 
3480   // Remove redundant induction instructions.
3481   cse(LoopVectorBody);
3482 
3483   // Set/update profile weights for the vector and remainder loops as original
3484   // loop iterations are now distributed among them. Note that original loop
3485   // represented by LoopScalarBody becomes remainder loop after vectorization.
3486   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly roughened result, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible bypass
  // of the vector code caused by legality checks is ignored, optimistically
  // assigning all the weight to the vector loop.
3492   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3493                                LI->getLoopFor(LoopVectorBody),
3494                                LI->getLoopFor(LoopScalarBody), VF * UF);
3495 }
3496 
3497 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3498   // In order to support recurrences we need to be able to vectorize Phi nodes.
3499   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3500   // stage #2: We now need to fix the recurrences by adding incoming edges to
3501   // the currently empty PHI nodes. At this point every instruction in the
3502   // original loop is widened to a vector form so we can use them to construct
3503   // the incoming edges.
3504   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3505     // Handle first-order recurrences and reductions that need to be fixed.
3506     if (Legal->isFirstOrderRecurrence(&Phi))
3507       fixFirstOrderRecurrence(&Phi);
3508     else if (Legal->isReductionVariable(&Phi))
3509       fixReduction(&Phi);
3510   }
3511 }
3512 
3513 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3514   // This is the second phase of vectorizing first-order recurrences. An
3515   // overview of the transformation is described below. Suppose we have the
3516   // following loop.
3517   //
3518   //   for (int i = 0; i < n; ++i)
3519   //     b[i] = a[i] - a[i - 1];
3520   //
3521   // There is a first-order recurrence on "a". For this loop, the shorthand
3522   // scalar IR looks like:
3523   //
3524   //   scalar.ph:
3525   //     s_init = a[-1]
3526   //     br scalar.body
3527   //
3528   //   scalar.body:
3529   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3530   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3531   //     s2 = a[i]
3532   //     b[i] = s2 - s1
3533   //     br cond, scalar.body, ...
3534   //
  // In this example, s1 is a recurrence because its value depends on the
3536   // previous iteration. In the first phase of vectorization, we created a
3537   // temporary value for s1. We now complete the vectorization and produce the
3538   // shorthand vector IR shown below (for VF = 4, UF = 1).
3539   //
3540   //   vector.ph:
3541   //     v_init = vector(..., ..., ..., a[-1])
3542   //     br vector.body
3543   //
3544   //   vector.body
3545   //     i = phi [0, vector.ph], [i+4, vector.body]
3546   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3547   //     v2 = a[i, i+1, i+2, i+3];
3548   //     v3 = vector(v1(3), v2(0, 1, 2))
3549   //     b[i, i+1, i+2, i+3] = v2 - v3
3550   //     br cond, vector.body, middle.block
3551   //
3552   //   middle.block:
3553   //     x = v2(3)
3554   //     br scalar.ph
3555   //
3556   //   scalar.ph:
3557   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3558   //     br scalar.body
3559   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3562 
3563   // Get the original loop preheader and single loop latch.
3564   auto *Preheader = OrigLoop->getLoopPreheader();
3565   auto *Latch = OrigLoop->getLoopLatch();
3566 
3567   // Get the initial and previous values of the scalar recurrence.
3568   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3569   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3570 
3571   // Create a vector from the initial value.
3572   auto *VectorInit = ScalarInit;
3573   if (VF > 1) {
3574     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3575     VectorInit = Builder.CreateInsertElement(
3576         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3577         Builder.getInt32(VF - 1), "vector.recur.init");
3578   }
3579 
3580   // We constructed a temporary phi node in the first phase of vectorization.
3581   // This phi node will eventually be deleted.
3582   Builder.SetInsertPoint(
3583       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3584 
3585   // Create a phi node for the new recurrence. The current value will either be
3586   // the initial value inserted into a vector or loop-varying vector value.
3587   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3588   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3589 
3590   // Get the vectorized previous value of the last part UF - 1. It appears last
3591   // among all unrolled iterations, due to the order of their construction.
3592   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3593 
3594   // Find and set the insertion point after the previous value if it is an
3595   // instruction.
3596   BasicBlock::iterator InsertPt;
3597   // Note that the previous value may have been constant-folded so it is not
3598   // guaranteed to be an instruction in the vector loop.
3599   // FIXME: Loop invariant values do not form recurrences. We should deal with
3600   //        them earlier.
3601   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3602     InsertPt = LoopVectorBody->getFirstInsertionPt();
3603   else {
3604     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3605     if (isa<PHINode>(PreviousLastPart))
3606       // If the previous value is a phi node, we should insert after all the phi
3607       // nodes in the block containing the PHI to avoid breaking basic block
3608       // verification. Note that the basic block may be different to
3609       // LoopVectorBody, in case we predicate the loop.
3610       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3611     else
3612       InsertPt = ++PreviousInst->getIterator();
3613   }
3614   Builder.SetInsertPoint(&*InsertPt);
3615 
3616   // We will construct a vector for the recurrence by combining the values for
3617   // the current and previous iterations. This is the required shuffle mask.
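  // For VF = 4 the mask is <3, 4, 5, 6>: the last element of the vector from
  // the previous iteration followed by the first three elements of the
  // current one.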
3618   SmallVector<Constant *, 8> ShuffleMask(VF);
3619   ShuffleMask[0] = Builder.getInt32(VF - 1);
3620   for (unsigned I = 1; I < VF; ++I)
3621     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3622 
3623   // The vector from which to take the initial value for the current iteration
3624   // (actual or unrolled). Initially, this is the vector phi node.
3625   Value *Incoming = VecPhi;
3626 
3627   // Shuffle the current and previous vector and update the vector parts.
3628   for (unsigned Part = 0; Part < UF; ++Part) {
3629     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3630     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3631     auto *Shuffle =
3632         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3633                                              ConstantVector::get(ShuffleMask))
3634                : Incoming;
3635     PhiPart->replaceAllUsesWith(Shuffle);
3636     cast<Instruction>(PhiPart)->eraseFromParent();
3637     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3638     Incoming = PreviousPart;
3639   }
3640 
3641   // Fix the latch value of the new recurrence in the vector loop.
3642   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3643 
3644   // Extract the last vector element in the middle block. This will be the
3645   // initial value for the recurrence when jumping to the scalar loop.
3646   auto *ExtractForScalar = Incoming;
3647   if (VF > 1) {
3648     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3649     ExtractForScalar = Builder.CreateExtractElement(
3650         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3651   }
3652   // Extract the second-to-last element in the middle block if the
3653   // Phi is used outside the loop. We need to extract the phi itself
3654   // and not the last element (the phi update in the current iteration). This
3655   // is the value used when jumping from the LoopMiddleBlock to the exit block
3656   // when the scalar loop is not run at all.
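       // For example (illustrative, VF = 4): lane 3 of 'Incoming' holds the
       // recurrence update of the final vector iteration (used above to seed the
       // scalar loop), while lane 2 holds the phi's value in that iteration, which
       // is what users outside the loop need when the scalar loop is skipped.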
3657   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3658   if (VF > 1)
3659     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3660         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3661   // When the loop is unrolled without vectorizing, initialize
3662   // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3663   // `Incoming`. This is analogous to the vectorized case above: extracting the
3664   // second-to-last element when VF > 1.
3665   else if (UF > 1)
3666     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3667 
3668   // Fix the initial value of the original recurrence in the scalar loop.
3669   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3670   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3671   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3672     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3673     Start->addIncoming(Incoming, BB);
3674   }
3675 
3676   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3677   Phi->setName("scalar.recur");
3678 
3679   // Finally, fix users of the recurrence outside the loop. The users will need
3680   // either the last value of the scalar recurrence or the last value of the
3681   // vector recurrence we extracted in the middle block. Since the loop is in
3682   // LCSSA form, we just need to find all the phi nodes for the original scalar
3683   // recurrence in the exit block, and then add an edge for the middle block.
3684   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3685     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3686       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3687     }
3688   }
3689 }
3690 
3691 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3692   Constant *Zero = Builder.getInt32(0);
3693 
3694   // Get its reduction variable descriptor.
3695   assert(Legal->isReductionVariable(Phi) &&
3696          "Unable to find the reduction variable");
3697   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3698 
3699   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3700   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3701   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3702   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3703     RdxDesc.getMinMaxRecurrenceKind();
3704   setDebugLocFromInst(Builder, ReductionStartValue);
3705 
3706   // We need to generate a reduction vector from the incoming scalar.
3707   // To do so, we generate the 'identity' vector and override one of its
3708   // elements with the incoming scalar reduction. This has to be done in
3709   // the vector-loop preheader.
3710   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3711 
3712   // This is the vector-clone of the value that leaves the loop.
3713   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3714 
3715   // Find the reduction identity value: zero for addition, or, and xor;
3716   // one for multiplication; -1 for and.
3717   Value *Identity;
3718   Value *VectorStart;
3719   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3720       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3721     // MinMax reductions have the start value as their identity.
3722     if (VF == 1) {
3723       VectorStart = Identity = ReductionStartValue;
3724     } else {
3725       VectorStart = Identity =
3726         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3727     }
3728   } else {
3729     // Handle other reduction kinds:
3730     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3731         RK, VecTy->getScalarType());
3732     if (VF == 1) {
3733       Identity = Iden;
3734       // In the scalar (VF == 1) case, the start value is simply the incoming
3735       // scalar reduction value.
3736       VectorStart = ReductionStartValue;
3737     } else {
3738       Identity = ConstantVector::getSplat({VF, false}, Iden);
3739 
3740       // This vector is the Identity vector where the first element is the
3741       // incoming scalar reduction.
3742       VectorStart =
3743         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3744     }
3745   }
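       // For example (illustrative), a <4 x i32> integer add reduction starting at
       // a hypothetical value %start would use:
       //   Identity    = <4 x i32> zeroinitializer
       //   VectorStart = insertelement <4 x i32> zeroinitializer, i32 %start, i32 0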
3746 
3747   // Wrap flags are in general invalid after vectorization, clear them.
3748   clearReductionWrapFlags(RdxDesc);
3749 
3750   // Fix the vector-loop phi.
3751 
3752   // Reductions do not have to start at zero. They can start with
3753   // any loop invariant values.
3754   BasicBlock *Latch = OrigLoop->getLoopLatch();
3755   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3756 
3757   for (unsigned Part = 0; Part < UF; ++Part) {
3758     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3759     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3760     // Make sure to add the reduction start value only to the
3761     // first unroll part.
3762     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3763     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3764     cast<PHINode>(VecRdxPhi)
3765       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3766   }
3767 
3768   // Move the insertion point to the middle block, right between the PHIs and
3769   // the values we are going to write.
3770   // This allows us to write both PHINodes and the extractelement
3771   // instructions.
3772   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3773 
3774   setDebugLocFromInst(Builder, LoopExitInst);
3775 
3776   // If the tail is folded by masking, the vector value that leaves the loop
3777   // should be a Select choosing between the vectorized LoopExitInst and the
3778   // vectorized Phi, rather than the former alone.
3779   if (Cost->foldTailByMasking()) {
3780     for (unsigned Part = 0; Part < UF; ++Part) {
3781       Value *VecLoopExitInst =
3782           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3783       Value *Sel = nullptr;
3784       for (User *U : VecLoopExitInst->users()) {
3785         if (isa<SelectInst>(U)) {
3786           assert(!Sel && "Reduction exit feeding two selects");
3787           Sel = U;
3788         } else
3789           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3790       }
3791       assert(Sel && "Reduction exit feeds no select");
3792       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3793     }
3794   }
3795 
3796   // If the vector reduction can be performed in a smaller type, we truncate
3797   // then extend the loop exit value to enable InstCombine to evaluate the
3798   // entire expression in the smaller type.
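       // For example (illustrative): an i32 add reduction whose values are known
       // to fit in i8 is truncated to <VF x i8> and immediately extended back, so
       // InstCombine can later shrink the whole reduction chain to operate on i8.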
3799   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3800     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3801     Builder.SetInsertPoint(
3802         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3803     VectorParts RdxParts(UF);
3804     for (unsigned Part = 0; Part < UF; ++Part) {
3805       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3806       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3807       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3808                                         : Builder.CreateZExt(Trunc, VecTy);
3809       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3810            UI != RdxParts[Part]->user_end();)
3811         if (*UI != Trunc) {
3812           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3813           RdxParts[Part] = Extnd;
3814         } else {
3815           ++UI;
3816         }
3817     }
3818     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3819     for (unsigned Part = 0; Part < UF; ++Part) {
3820       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3821       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3822     }
3823   }
3824 
3825   // Reduce all of the unrolled parts into a single vector.
3826   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3827   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3828 
3829   // The middle block terminator has already been assigned a DebugLoc here (the
3830   // OrigLoop's single latch terminator). We want the whole middle block to
3831   // appear to execute on this line because: (a) it is all compiler generated,
3832   // (b) these instructions are always executed after evaluating the latch
3833   // conditional branch, and (c) other passes may add new predecessors which
3834   // terminate on this line. This is the easiest way to ensure we don't
3835   // accidentally cause an extra step back into the loop while debugging.
3836   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3837   for (unsigned Part = 1; Part < UF; ++Part) {
3838     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3839     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3840       // Floating point operations had to be 'fast' to enable the reduction.
3841       ReducedPartRdx = addFastMathFlag(
3842           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3843                               ReducedPartRdx, "bin.rdx"),
3844           RdxDesc.getFastMathFlags());
3845     else
3846       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3847                                       RdxPart);
3848   }
3849 
3850   if (VF > 1) {
3851     bool NoNaN = Legal->hasFunNoNaNAttr();
3852     ReducedPartRdx =
3853         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3854     // If the reduction can be performed in a smaller type, we need to extend
3855     // the reduction to the wider type before we branch to the original loop.
3856     if (Phi->getType() != RdxDesc.getRecurrenceType())
3857       ReducedPartRdx =
3858         RdxDesc.isSigned()
3859         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3860         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3861   }
3862 
3863   // Create a phi node that merges control-flow from the backedge-taken check
3864   // block and the middle block.
3865   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3866                                         LoopScalarPreHeader->getTerminator());
3867   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3868     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3869   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3870 
3871   // Now, we need to fix the users of the reduction variable
3872   // inside and outside of the scalar remainder loop.
3873   // We know that the loop is in LCSSA form. We need to update the
3874   // PHI nodes in the exit blocks.
3875   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3876     // All PHINodes need to have a single entry edge, or two if
3877     // we already fixed them.
3878     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3879 
3880     // We found a reduction value exit-PHI. Update it with the
3881     // incoming bypass edge.
3882     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3883       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3884   } // end of the LCSSA phi scan.
3885 
3886   // Fix the scalar loop reduction variable with the incoming reduction sum
3887   // from the vector body and from the backedge value.
3888   int IncomingEdgeBlockIdx =
3889     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3890   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3891   // Pick the other block.
3892   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3893   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3894   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3895 }
3896 
3897 void InnerLoopVectorizer::clearReductionWrapFlags(
3898     RecurrenceDescriptor &RdxDesc) {
3899   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3900   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3901       RK != RecurrenceDescriptor::RK_IntegerMult)
3902     return;
3903 
3904   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3905   assert(LoopExitInstr && "null loop exit instruction");
3906   SmallVector<Instruction *, 8> Worklist;
3907   SmallPtrSet<Instruction *, 8> Visited;
3908   Worklist.push_back(LoopExitInstr);
3909   Visited.insert(LoopExitInstr);
3910 
3911   while (!Worklist.empty()) {
3912     Instruction *Cur = Worklist.pop_back_val();
3913     if (isa<OverflowingBinaryOperator>(Cur))
3914       for (unsigned Part = 0; Part < UF; ++Part) {
3915         Value *V = getOrCreateVectorValue(Cur, Part);
3916         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3917       }
3918 
3919     for (User *U : Cur->users()) {
3920       Instruction *UI = cast<Instruction>(U);
3921       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3922           Visited.insert(UI).second)
3923         Worklist.push_back(UI);
3924     }
3925   }
3926 }
3927 
3928 void InnerLoopVectorizer::fixLCSSAPHIs() {
3929   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3930     if (LCSSAPhi.getNumIncomingValues() == 1) {
3931       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3932       // Non-instruction incoming values have only one value, so use lane zero.
3933       unsigned LastLane = 0;
3934       if (isa<Instruction>(IncomingValue))
3935           LastLane = Cost->isUniformAfterVectorization(
3936                          cast<Instruction>(IncomingValue), VF)
3937                          ? 0
3938                          : VF - 1;
3939       // Can be a loop invariant incoming value or the last scalar value to be
3940       // extracted from the vectorized loop.
3941       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3942       Value *lastIncomingValue =
3943           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3944       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3945     }
3946   }
3947 }
3948 
3949 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3950   // The basic block and loop containing the predicated instruction.
3951   auto *PredBB = PredInst->getParent();
3952   auto *VectorLoop = LI->getLoopFor(PredBB);
3953 
3954   // Initialize a worklist with the operands of the predicated instruction.
3955   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3956 
3957   // Holds instructions that we need to analyze again. An instruction may be
3958   // reanalyzed if we don't yet know if we can sink it or not.
3959   SmallVector<Instruction *, 8> InstsToReanalyze;
3960 
3961   // Returns true if a given use occurs in the predicated block. Phi nodes use
3962   // their operands in their corresponding predecessor blocks.
3963   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3964     auto *I = cast<Instruction>(U.getUser());
3965     BasicBlock *BB = I->getParent();
3966     if (auto *Phi = dyn_cast<PHINode>(I))
3967       BB = Phi->getIncomingBlock(
3968           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3969     return BB == PredBB;
3970   };
3971 
3972   // Iteratively sink the scalarized operands of the predicated instruction
3973   // into the block we created for it. When an instruction is sunk, its
3974   // operands are then added to the worklist. The algorithm ends when a full
3975   // pass through the worklist sinks no instruction.
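       // For example (illustrative): if a conditional store was scalarized into
       // PredBB, a getelementptr feeding only that store can be sunk into PredBB,
       // after which the operands of that getelementptr are reconsidered in turn.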
3976   bool Changed;
3977   do {
3978     // Add the instructions that need to be reanalyzed to the worklist, and
3979     // reset the changed indicator.
3980     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3981     InstsToReanalyze.clear();
3982     Changed = false;
3983 
3984     while (!Worklist.empty()) {
3985       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3986 
3987       // We can't sink an instruction if it is a phi node, is already in the
3988       // predicated block, is not in the loop, or may have side effects.
3989       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3990           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3991         continue;
3992 
3993       // It's legal to sink the instruction if all its uses occur in the
3994       // predicated block. Otherwise, there's nothing to do yet, and we may
3995       // need to reanalyze the instruction.
3996       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3997         InstsToReanalyze.push_back(I);
3998         continue;
3999       }
4000 
4001       // Move the instruction to the beginning of the predicated block, and add
4002       // its operands to the worklist.
4003       I->moveBefore(&*PredBB->getFirstInsertionPt());
4004       Worklist.insert(I->op_begin(), I->op_end());
4005 
4006       // The sinking may have enabled other instructions to be sunk, so we will
4007       // need to iterate.
4008       Changed = true;
4009     }
4010   } while (Changed);
4011 }
4012 
4013 void InnerLoopVectorizer::fixNonInductionPHIs() {
4014   for (PHINode *OrigPhi : OrigPHIsToFix) {
4015     PHINode *NewPhi =
4016         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4017     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4018 
4019     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4020         predecessors(OrigPhi->getParent()));
4021     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4022         predecessors(NewPhi->getParent()));
4023     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4024            "Scalar and Vector BB should have the same number of predecessors");
4025 
4026     // The insertion point in Builder may be invalidated by the time we get
4027     // here. Force the Builder insertion point to something valid so that we do
4028     // not run into issues during insertion point restore in
4029     // getOrCreateVectorValue calls below.
4030     Builder.SetInsertPoint(NewPhi);
4031 
4032     // The predecessor order is preserved and we can rely on mapping between
4033     // scalar and vector block predecessors.
4034     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4035       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4036 
4037       // When looking up the new scalar/vector values to fix up, use incoming
4038       // values from original phi.
4039       Value *ScIncV =
4040           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4041 
4042       // A scalar incoming value may need a broadcast.
4043       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4044       NewPhi->addIncoming(NewIncV, NewPredBB);
4045     }
4046   }
4047 }
4048 
4049 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4050                                    unsigned VF, bool IsPtrLoopInvariant,
4051                                    SmallBitVector &IsIndexLoopInvariant) {
4052   // Construct a vector GEP by widening the operands of the scalar GEP as
4053   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4054   // results in a vector of pointers when at least one operand of the GEP
4055   // is vector-typed. Thus, to keep the representation compact, we only use
4056   // vector-typed operands for loop-varying values.
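       // For example (illustrative, VF = 4, with a hypothetical loop-varying index
       // %vec.ind and a loop-invariant base %a):
       //   scalar:  %gep = getelementptr inbounds i32, i32* %a, i64 %i
       //   widened: %gep = getelementptr inbounds i32, i32* %a, <4 x i64> %vec.ind
       // The loop-invariant base stays scalar and the result has type <4 x i32*>.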
4057 
4058   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4059     // If we are vectorizing, but the GEP has only loop-invariant operands,
4060     // the GEP we build (by only using vector-typed operands for
4061     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4062     // produce a vector of pointers, we need to either arbitrarily pick an
4063     // operand to broadcast, or broadcast a clone of the original GEP.
4064     // Here, we broadcast a clone of the original.
4065     //
4066     // TODO: If at some point we decide to scalarize instructions having
4067     //       loop-invariant operands, this special case will no longer be
4068     //       required. We would add the scalarization decision to
4069     //       collectLoopScalars() and teach getVectorValue() to broadcast
4070     //       the lane-zero scalar value.
4071     auto *Clone = Builder.Insert(GEP->clone());
4072     for (unsigned Part = 0; Part < UF; ++Part) {
4073       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4074       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4075       addMetadata(EntryPart, GEP);
4076     }
4077   } else {
4078     // If the GEP has at least one loop-varying operand, we are sure to
4079     // produce a vector of pointers. But if we are only unrolling, we want
4080     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4081     // produce with the code below will be scalar (if VF == 1) or vector
4082     // (otherwise). Note that for the unroll-only case, we still maintain
4083     // values in the vector mapping (VectorLoopValueMap), as we do for other
4084     // instructions.
4085     for (unsigned Part = 0; Part < UF; ++Part) {
4086       // The pointer operand of the new GEP. If it's loop-invariant, we
4087       // won't broadcast it.
4088       auto *Ptr = IsPtrLoopInvariant
4089                       ? GEP->getPointerOperand()
4090                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4091 
4092       // Collect all the indices for the new GEP. If any index is
4093       // loop-invariant, we won't broadcast it.
4094       SmallVector<Value *, 4> Indices;
4095       for (auto Index : enumerate(GEP->indices())) {
4096         Value *User = Index.value().get();
4097         if (IsIndexLoopInvariant[Index.index()])
4098           Indices.push_back(User);
4099         else
4100           Indices.push_back(getOrCreateVectorValue(User, Part));
4101       }
4102 
4103       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4104       // but it should be a vector otherwise.
4105       auto *NewGEP =
4106           GEP->isInBounds()
4107               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4108                                           Indices)
4109               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4110       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4111              "NewGEP is not a pointer vector");
4112       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4113       addMetadata(NewGEP, GEP);
4114     }
4115   }
4116 }
4117 
4118 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4119                                               unsigned VF) {
4120   PHINode *P = cast<PHINode>(PN);
4121   if (EnableVPlanNativePath) {
4122     // Currently we enter here in the VPlan-native path for non-induction
4123     // PHIs where all control flow is uniform. We simply widen these PHIs.
4124     // Create a vector phi with no operands - the vector phi operands will be
4125     // set at the end of vector code generation.
4126     Type *VecTy =
4127         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4128     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4129     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4130     OrigPHIsToFix.push_back(P);
4131 
4132     return;
4133   }
4134 
4135   assert(PN->getParent() == OrigLoop->getHeader() &&
4136          "Non-header phis should have been handled elsewhere");
4137 
4138   // In order to support recurrences we need to be able to vectorize Phi nodes.
4139   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4140   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4141   // this value when we vectorize all of the instructions that use the PHI.
4142   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4143     for (unsigned Part = 0; Part < UF; ++Part) {
4144       // This is phase one of vectorizing PHIs.
4145       Type *VecTy =
4146           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4147       Value *EntryPart = PHINode::Create(
4148           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4149       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4150     }
4151     return;
4152   }
4153 
4154   setDebugLocFromInst(Builder, P);
4155 
4156   // This PHINode must be an induction variable.
4157   // Make sure that we know about it.
4158   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4159 
4160   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4161   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4162 
4163   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4164   // which can be found from the original scalar operations.
4165   switch (II.getKind()) {
4166   case InductionDescriptor::IK_NoInduction:
4167     llvm_unreachable("Unknown induction");
4168   case InductionDescriptor::IK_IntInduction:
4169   case InductionDescriptor::IK_FpInduction:
4170     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4171   case InductionDescriptor::IK_PtrInduction: {
4172     // Handle the pointer induction variable case.
4173     assert(P->getType()->isPointerTy() && "Unexpected type.");
4174     // This is the normalized GEP that starts counting at zero.
4175     Value *PtrInd = Induction;
4176     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4177     // Determine the number of scalars we need to generate for each unroll
4178     // iteration. If the instruction is uniform, we only need to generate the
4179     // first lane. Otherwise, we generate all VF values.
4180     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4181     // These are the scalar results. Notice that we don't generate vector GEPs
4182     // because scalar GEPs result in better code.
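         // For example (illustrative, VF = 4, UF = 2, non-uniform): part 0 covers
         // indices PtrInd + 0 .. PtrInd + 3 and part 1 covers PtrInd + 4 ..
         // PtrInd + 7, each lane producing its own scalar "next.gep".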
4183     for (unsigned Part = 0; Part < UF; ++Part) {
4184       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4185         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4186         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4187         Value *SclrGep =
4188             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4189         SclrGep->setName("next.gep");
4190         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4191       }
4192     }
4193     return;
4194   }
4195   }
4196 }
4197 
4198 /// A helper function for checking whether an integer division-related
4199 /// instruction may divide by zero (in which case it must be predicated if
4200 /// executed conditionally in the scalar code).
4201 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4202 /// Non-zero divisors that are not compile-time constants will not be
4203 /// converted into multiplication, so we will still end up scalarizing
4204 /// the division, but can do so w/o predication.
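     /// For example (illustrative), 'udiv i32 %x, %n' with a non-constant %n may
     /// divide by zero and so must be predicated when executed conditionally,
     /// whereas 'udiv i32 %x, 7' can be scalarized without a predicate.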
4205 static bool mayDivideByZero(Instruction &I) {
4206   assert((I.getOpcode() == Instruction::UDiv ||
4207           I.getOpcode() == Instruction::SDiv ||
4208           I.getOpcode() == Instruction::URem ||
4209           I.getOpcode() == Instruction::SRem) &&
4210          "Unexpected instruction");
4211   Value *Divisor = I.getOperand(1);
4212   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4213   return !CInt || CInt->isZero();
4214 }
4215 
4216 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4217   switch (I.getOpcode()) {
4218   case Instruction::Br:
4219   case Instruction::PHI:
4220   case Instruction::GetElementPtr:
4221     llvm_unreachable("This instruction is handled by a different recipe.");
4222   case Instruction::UDiv:
4223   case Instruction::SDiv:
4224   case Instruction::SRem:
4225   case Instruction::URem:
4226   case Instruction::Add:
4227   case Instruction::FAdd:
4228   case Instruction::Sub:
4229   case Instruction::FSub:
4230   case Instruction::FNeg:
4231   case Instruction::Mul:
4232   case Instruction::FMul:
4233   case Instruction::FDiv:
4234   case Instruction::FRem:
4235   case Instruction::Shl:
4236   case Instruction::LShr:
4237   case Instruction::AShr:
4238   case Instruction::And:
4239   case Instruction::Or:
4240   case Instruction::Xor: {
4241     // Just widen unops and binops.
4242     setDebugLocFromInst(Builder, &I);
4243 
4244     for (unsigned Part = 0; Part < UF; ++Part) {
4245       SmallVector<Value *, 2> Ops;
4246       for (Value *Op : I.operands())
4247         Ops.push_back(getOrCreateVectorValue(Op, Part));
4248 
4249       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4250 
4251       if (auto *VecOp = dyn_cast<Instruction>(V))
4252         VecOp->copyIRFlags(&I);
4253 
4254       // Use this vector value for all users of the original instruction.
4255       VectorLoopValueMap.setVectorValue(&I, Part, V);
4256       addMetadata(V, &I);
4257     }
4258 
4259     break;
4260   }
4261   case Instruction::Select: {
4262     // Widen selects.
4263     // If the selector is loop invariant we can create a select
4264     // instruction with a scalar condition. Otherwise, use vector-select.
4265     auto *SE = PSE.getSE();
4266     bool InvariantCond =
4267         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4268     setDebugLocFromInst(Builder, &I);
4269 
4270     // The condition can be loop invariant but still defined inside the
4271     // loop. This means that we can't just use the original 'cond' value.
4272     // We have to take the 'vectorized' value and pick the first lane.
4273     // Instcombine will make this a no-op.
4274 
4275     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4276 
4277     for (unsigned Part = 0; Part < UF; ++Part) {
4278       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4279       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4280       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4281       Value *Sel =
4282           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4283       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4284       addMetadata(Sel, &I);
4285     }
4286 
4287     break;
4288   }
4289 
4290   case Instruction::ICmp:
4291   case Instruction::FCmp: {
4292     // Widen compares. Generate vector compares.
4293     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4294     auto *Cmp = cast<CmpInst>(&I);
4295     setDebugLocFromInst(Builder, Cmp);
4296     for (unsigned Part = 0; Part < UF; ++Part) {
4297       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4298       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4299       Value *C = nullptr;
4300       if (FCmp) {
4301         // Propagate fast math flags.
4302         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4303         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4304         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4305       } else {
4306         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4307       }
4308       VectorLoopValueMap.setVectorValue(&I, Part, C);
4309       addMetadata(C, &I);
4310     }
4311 
4312     break;
4313   }
4314 
4315   case Instruction::ZExt:
4316   case Instruction::SExt:
4317   case Instruction::FPToUI:
4318   case Instruction::FPToSI:
4319   case Instruction::FPExt:
4320   case Instruction::PtrToInt:
4321   case Instruction::IntToPtr:
4322   case Instruction::SIToFP:
4323   case Instruction::UIToFP:
4324   case Instruction::Trunc:
4325   case Instruction::FPTrunc:
4326   case Instruction::BitCast: {
4327     auto *CI = cast<CastInst>(&I);
4328     setDebugLocFromInst(Builder, CI);
4329 
4330     // Vectorize casts.
4331     Type *DestTy =
4332         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4333 
4334     for (unsigned Part = 0; Part < UF; ++Part) {
4335       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4336       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4337       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4338       addMetadata(Cast, &I);
4339     }
4340     break;
4341   }
4342 
4343   case Instruction::Call: {
4344     // Ignore dbg intrinsics.
4345     if (isa<DbgInfoIntrinsic>(I))
4346       break;
4347     setDebugLocFromInst(Builder, &I);
4348 
4349     Module *M = I.getParent()->getParent()->getParent();
4350     auto *CI = cast<CallInst>(&I);
4351 
4352     SmallVector<Type *, 4> Tys;
4353     for (Value *ArgOperand : CI->arg_operands())
4354       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4355 
4356     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4357 
4358     // This flag shows whether we use an intrinsic or a plain call for the
4359     // vectorized version of the instruction, i.e., whether calling the
4360     // intrinsic is more beneficial than calling a vector library function.
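         // For example (illustrative): a call that maps to the llvm.sqrt intrinsic
         // may be widened to its vector form, or replaced by a vector library
         // routine found through VFDatabase, whichever the cost model finds cheaper.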
4361     bool NeedToScalarize = false;
4362     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4363     bool UseVectorIntrinsic =
4364         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4365     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4366            "Instruction should be scalarized elsewhere.");
4367 
4368     for (unsigned Part = 0; Part < UF; ++Part) {
4369       SmallVector<Value *, 4> Args;
4370       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4371         Value *Arg = CI->getArgOperand(i);
4372         // Some intrinsics have a scalar argument - don't replace it with a
4373         // vector.
4374         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4375           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4376         Args.push_back(Arg);
4377       }
4378 
4379       Function *VectorF;
4380       if (UseVectorIntrinsic) {
4381         // Use vector version of the intrinsic.
4382         Type *TysForDecl[] = {CI->getType()};
4383         if (VF > 1)
4384           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4385         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4386       } else {
4387         // Use vector version of the function call.
4388         const VFShape Shape =
4389             VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4390 #ifndef NDEBUG
4391         const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
4392         assert(std::find_if(Infos.begin(), Infos.end(),
4393                             [&Shape](const VFInfo &Info) {
4394                               return Info.Shape == Shape;
4395                             }) != Infos.end() &&
4396                "Vector function shape is missing from the database.");
4397 #endif
4398         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4399       }
4400       assert(VectorF && "Can't create vector function.");
4401 
4402       SmallVector<OperandBundleDef, 1> OpBundles;
4403       CI->getOperandBundlesAsDefs(OpBundles);
4404       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4405 
4406       if (isa<FPMathOperator>(V))
4407         V->copyFastMathFlags(CI);
4408 
4409       VectorLoopValueMap.setVectorValue(&I, Part, V);
4410       addMetadata(V, &I);
4411     }
4412 
4413     break;
4414   }
4415 
4416   default:
4417     // This instruction is not vectorized by simple widening.
4418     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4419     llvm_unreachable("Unhandled instruction!");
4420   } // end of switch.
4421 }
4422 
4423 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4424   // We should not collect Scalars more than once per VF. Right now, this
4425   // function is called from collectUniformsAndScalars(), which already does
4426   // this check. Collecting Scalars for VF=1 does not make any sense.
4427   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4428          "This function should not be visited twice for the same VF");
4429 
4430   SmallSetVector<Instruction *, 8> Worklist;
4431 
4432   // These sets are used to seed the analysis with pointers used by memory
4433   // accesses that will remain scalar.
4434   SmallSetVector<Instruction *, 8> ScalarPtrs;
4435   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4436 
4437   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4438   // The pointer operands of loads and stores will be scalar as long as the
4439   // memory access is not a gather or scatter operation. The value operand of a
4440   // store will remain scalar if the store is scalarized.
4441   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4442     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4443     assert(WideningDecision != CM_Unknown &&
4444            "Widening decision should be ready at this moment");
4445     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4446       if (Ptr == Store->getValueOperand())
4447         return WideningDecision == CM_Scalarize;
4448     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4449            "Ptr is neither a value nor a pointer operand");
4450     return WideningDecision != CM_GatherScatter;
4451   };
4452 
4453   // A helper that returns true if the given value is a bitcast or
4454   // getelementptr instruction contained in the loop.
4455   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4456     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4457             isa<GetElementPtrInst>(V)) &&
4458            !TheLoop->isLoopInvariant(V);
4459   };
4460 
4461   // A helper that evaluates a memory access's use of a pointer. If the use
4462   // will be a scalar use, and the pointer is only used by memory accesses, we
4463   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4464   // PossibleNonScalarPtrs.
4465   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4466     // We only care about bitcast and getelementptr instructions contained in
4467     // the loop.
4468     if (!isLoopVaryingBitCastOrGEP(Ptr))
4469       return;
4470 
4471     // If the pointer has already been identified as scalar (e.g., if it was
4472     // also identified as uniform), there's nothing to do.
4473     auto *I = cast<Instruction>(Ptr);
4474     if (Worklist.count(I))
4475       return;
4476 
4477     // If the use of the pointer will be a scalar use, and all users of the
4478     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4479     // place the pointer in PossibleNonScalarPtrs.
4480     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4481           return isa<LoadInst>(U) || isa<StoreInst>(U);
4482         }))
4483       ScalarPtrs.insert(I);
4484     else
4485       PossibleNonScalarPtrs.insert(I);
4486   };
4487 
4488   // We seed the scalars analysis with three classes of instructions: (1)
4489   // instructions marked uniform-after-vectorization, (2) bitcast and
4490   // getelementptr instructions used by memory accesses requiring a scalar use,
4491   // and (3) pointer induction variables and their update instructions (we
4492   // currently only scalarize these).
4493   //
4494   // (1) Add to the worklist all instructions that have been identified as
4495   // uniform-after-vectorization.
4496   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4497 
4498   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4499   // memory accesses requiring a scalar use. The pointer operands of loads and
4500   // stores will be scalar as long as the memory access is not a gather or
4501   // scatter operation. The value operand of a store will remain scalar if the
4502   // store is scalarized.
4503   for (auto *BB : TheLoop->blocks())
4504     for (auto &I : *BB) {
4505       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4506         evaluatePtrUse(Load, Load->getPointerOperand());
4507       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4508         evaluatePtrUse(Store, Store->getPointerOperand());
4509         evaluatePtrUse(Store, Store->getValueOperand());
4510       }
4511     }
4512   for (auto *I : ScalarPtrs)
4513     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4514       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4515       Worklist.insert(I);
4516     }
4517 
4518   // (3) Add to the worklist all pointer induction variables and their update
4519   // instructions.
4520   //
4521   // TODO: Once we are able to vectorize pointer induction variables we should
4522   //       no longer insert them into the worklist here.
4523   auto *Latch = TheLoop->getLoopLatch();
4524   for (auto &Induction : Legal->getInductionVars()) {
4525     auto *Ind = Induction.first;
4526     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4527     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4528       continue;
4529     Worklist.insert(Ind);
4530     Worklist.insert(IndUpdate);
4531     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4532     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4533                       << "\n");
4534   }
4535 
4536   // Insert the forced scalars.
4537   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4538   // induction variable when the PHI user is scalarized.
4539   auto ForcedScalar = ForcedScalars.find(VF);
4540   if (ForcedScalar != ForcedScalars.end())
4541     for (auto *I : ForcedScalar->second)
4542       Worklist.insert(I);
4543 
4544   // Expand the worklist by looking through any bitcasts and getelementptr
4545   // instructions we've already identified as scalar. This is similar to the
4546   // expansion step in collectLoopUniforms(); however, here we're only
4547   // expanding to include additional bitcasts and getelementptr instructions.
4548   unsigned Idx = 0;
4549   while (Idx != Worklist.size()) {
4550     Instruction *Dst = Worklist[Idx++];
4551     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4552       continue;
4553     auto *Src = cast<Instruction>(Dst->getOperand(0));
4554     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4555           auto *J = cast<Instruction>(U);
4556           return !TheLoop->contains(J) || Worklist.count(J) ||
4557                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4558                   isScalarUse(J, Src));
4559         })) {
4560       Worklist.insert(Src);
4561       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4562     }
4563   }
4564 
4565   // An induction variable will remain scalar if all users of the induction
4566   // variable and induction variable update remain scalar.
4567   for (auto &Induction : Legal->getInductionVars()) {
4568     auto *Ind = Induction.first;
4569     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4570 
4571     // We already considered pointer induction variables, so there's no reason
4572     // to look at their users again.
4573     //
4574     // TODO: Once we are able to vectorize pointer induction variables we
4575     //       should no longer skip over them here.
4576     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4577       continue;
4578 
4579     // Determine if all users of the induction variable are scalar after
4580     // vectorization.
4581     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4582       auto *I = cast<Instruction>(U);
4583       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4584     });
4585     if (!ScalarInd)
4586       continue;
4587 
4588     // Determine if all users of the induction variable update instruction are
4589     // scalar after vectorization.
4590     auto ScalarIndUpdate =
4591         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4592           auto *I = cast<Instruction>(U);
4593           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4594         });
4595     if (!ScalarIndUpdate)
4596       continue;
4597 
4598     // The induction variable and its update instruction will remain scalar.
4599     Worklist.insert(Ind);
4600     Worklist.insert(IndUpdate);
4601     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4602     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4603                       << "\n");
4604   }
4605 
4606   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4607 }
4608 
4609 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4610   if (!blockNeedsPredication(I->getParent()))
4611     return false;
4612   switch(I->getOpcode()) {
4613   default:
4614     break;
4615   case Instruction::Load:
4616   case Instruction::Store: {
4617     if (!Legal->isMaskRequired(I))
4618       return false;
4619     auto *Ptr = getLoadStorePointerOperand(I);
4620     auto *Ty = getMemInstValueType(I);
4621     // We have already decided how to vectorize this instruction, get that
4622     // result.
4623     if (VF > 1) {
4624       InstWidening WideningDecision = getWideningDecision(I, VF);
4625       assert(WideningDecision != CM_Unknown &&
4626              "Widening decision should be ready at this moment");
4627       return WideningDecision == CM_Scalarize;
4628     }
4629     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4630     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4631                                 isLegalMaskedGather(Ty, Alignment))
4632                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4633                                 isLegalMaskedScatter(Ty, Alignment));
4634   }
4635   case Instruction::UDiv:
4636   case Instruction::SDiv:
4637   case Instruction::SRem:
4638   case Instruction::URem:
4639     return mayDivideByZero(*I);
4640   }
4641   return false;
4642 }
4643 
4644 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4645                                                                unsigned VF) {
4646   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4647   assert(getWideningDecision(I, VF) == CM_Unknown &&
4648          "Decision should not be set yet.");
4649   auto *Group = getInterleavedAccessGroup(I);
4650   assert(Group && "Must have a group.");
4651 
4652   // If the instruction's allocated size doesn't equal its type size, it
4653   // requires padding and will be scalarized.
4654   auto &DL = I->getModule()->getDataLayout();
4655   auto *ScalarTy = getMemInstValueType(I);
4656   if (hasIrregularType(ScalarTy, DL, VF))
4657     return false;
4658 
4659   // Check if masking is required.
4660   // A Group may need masking for one of two reasons: it resides in a block that
4661   // needs predication, or it was decided to use masking to deal with gaps.
4662   bool PredicatedAccessRequiresMasking =
4663       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4664   bool AccessWithGapsRequiresMasking =
4665       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4666   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4667     return true;
4668 
4669   // If masked interleaving is required, we expect that the user/target had
4670   // enabled it, because otherwise it either wouldn't have been created or
4671   // it should have been invalidated by the CostModel.
4672   assert(useMaskedInterleavedAccesses(TTI) &&
4673          "Masked interleave-groups for predicated accesses are not enabled.");
4674 
4675   auto *Ty = getMemInstValueType(I);
4676   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4677   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4678                           : TTI.isLegalMaskedStore(Ty, Alignment);
4679 }
4680 
4681 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4682                                                                unsigned VF) {
4683   // Get and ensure we have a valid memory instruction.
4684   LoadInst *LI = dyn_cast<LoadInst>(I);
4685   StoreInst *SI = dyn_cast<StoreInst>(I);
4686   assert((LI || SI) && "Invalid memory instruction");
4687 
4688   auto *Ptr = getLoadStorePointerOperand(I);
4689 
4690   // First of all, in order to be widened, the pointer should be consecutive.
4691   if (!Legal->isConsecutivePtr(Ptr))
4692     return false;
4693 
4694   // If the instruction is a store located in a predicated block, it will be
4695   // scalarized.
4696   if (isScalarWithPredication(I))
4697     return false;
4698 
4699   // If the instruction's allocated size doesn't equal its type size, it
4700   // requires padding and will be scalarized.
4701   auto &DL = I->getModule()->getDataLayout();
4702   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4703   if (hasIrregularType(ScalarTy, DL, VF))
4704     return false;
4705 
4706   return true;
4707 }
4708 
4709 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4710   // We should not collect Uniforms more than once per VF. Right now,
4711   // this function is called from collectUniformsAndScalars(), which
4712   // already does this check. Collecting Uniforms for VF=1 does not make any
4713   // sense.
4714 
4715   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4716          "This function should not be visited twice for the same VF");
4717 
4718   // Initialize the entry for this VF so that, even if no uniform value is
4719   // found, we do not analyze it again; Uniforms.count(VF) will return 1.
4720   Uniforms[VF].clear();
4721 
4722   // We now know that the loop is vectorizable!
4723   // Collect instructions inside the loop that will remain uniform after
4724   // vectorization.
4725 
4726   // Global values, params and instructions outside of the current loop are
4727   // out of scope.
4728   auto isOutOfScope = [&](Value *V) -> bool {
4729     Instruction *I = dyn_cast<Instruction>(V);
4730     return (!I || !TheLoop->contains(I));
4731   };
4732 
4733   SetVector<Instruction *> Worklist;
4734   BasicBlock *Latch = TheLoop->getLoopLatch();
4735 
4736   // Instructions that are scalar with predication must not be considered
4737   // uniform after vectorization, because that would create an erroneous
4738   // replicating region where only a single instance out of VF should be formed.
4739   // TODO: optimize such seldom cases if found important, see PR40816.
4740   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4741     if (isScalarWithPredication(I, VF)) {
4742       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4743                         << *I << "\n");
4744       return;
4745     }
4746     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4747     Worklist.insert(I);
4748   };
4749 
4750   // Start with the conditional branch. If the branch condition is an
4751   // instruction contained in the loop that is only used by the branch, it is
4752   // uniform.
4753   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4754   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4755     addToWorklistIfAllowed(Cmp);
4756 
4757   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4758   // are pointers that are treated like consecutive pointers during
4759   // vectorization. The pointer operands of interleaved accesses are an
4760   // example.
4761   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4762 
4763   // Holds pointer operands of instructions that are possibly non-uniform.
4764   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4765 
4766   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4767     InstWidening WideningDecision = getWideningDecision(I, VF);
4768     assert(WideningDecision != CM_Unknown &&
4769            "Widening decision should be ready at this moment");
4770 
4771     return (WideningDecision == CM_Widen ||
4772             WideningDecision == CM_Widen_Reverse ||
4773             WideningDecision == CM_Interleave);
4774   };
4775   // Iterate over the instructions in the loop, and collect all
4776   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4777   // that a consecutive-like pointer operand will be scalarized, we collect it
4778   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4779   // getelementptr instruction can be used by both vectorized and scalarized
4780   // memory instructions. For example, if a loop loads and stores from the same
4781   // location, but the store is conditional, the store will be scalarized, and
4782   // the getelementptr won't remain uniform.
4783   for (auto *BB : TheLoop->blocks())
4784     for (auto &I : *BB) {
4785       // If there's no pointer operand, there's nothing to do.
4786       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4787       if (!Ptr)
4788         continue;
4789 
4790       // True if all users of Ptr are memory accesses that have Ptr as their
4791       // pointer operand.
4792       auto UsersAreMemAccesses =
4793           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4794             return getLoadStorePointerOperand(U) == Ptr;
4795           });
4796 
4797       // Ensure the memory instruction will not be scalarized or used by
4798       // gather/scatter, making its pointer operand non-uniform. If the pointer
4799       // operand is used by any instruction other than a memory access, we
4800       // conservatively assume the pointer operand may be non-uniform.
4801       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4802         PossibleNonUniformPtrs.insert(Ptr);
4803 
4804       // If the memory instruction will be vectorized and its pointer operand
4805       // is consecutive-like or part of an interleave group, the pointer
4806       // operand should remain uniform.
4807       else
4808         ConsecutiveLikePtrs.insert(Ptr);
4809     }
4810 
4811   // Add to the Worklist all consecutive and consecutive-like pointers that
4812   // aren't also identified as possibly non-uniform.
4813   for (auto *V : ConsecutiveLikePtrs)
4814     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4815       addToWorklistIfAllowed(V);
4816 
4817   // Expand Worklist in topological order: whenever a new instruction
4818   // is added, its users should already be inside Worklist. This ensures
4819   // that a uniform instruction will only be used by uniform instructions.
4820   unsigned idx = 0;
4821   while (idx != Worklist.size()) {
4822     Instruction *I = Worklist[idx++];
4823 
4824     for (auto OV : I->operand_values()) {
4825       // isOutOfScope operands cannot be uniform instructions.
4826       if (isOutOfScope(OV))
4827         continue;
      // First-order recurrence phis are considered non-uniform.
4830       auto *OP = dyn_cast<PHINode>(OV);
4831       if (OP && Legal->isFirstOrderRecurrence(OP))
4832         continue;
4833       // If all the users of the operand are uniform, then add the
4834       // operand into the uniform worklist.
4835       auto *OI = cast<Instruction>(OV);
4836       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4837             auto *J = cast<Instruction>(U);
4838             return Worklist.count(J) ||
4839                    (OI == getLoadStorePointerOperand(J) &&
4840                     isUniformDecision(J, VF));
4841           }))
4842         addToWorklistIfAllowed(OI);
4843     }
4844   }
4845 
4846   // Returns true if Ptr is the pointer operand of a memory access instruction
4847   // I, and I is known to not require scalarization.
4848   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4849     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4850   };
4851 
4852   // For an instruction to be added into Worklist above, all its users inside
4853   // the loop should also be in Worklist. However, this condition cannot be
4854   // true for phi nodes that form a cyclic dependence. We must process phi
4855   // nodes separately. An induction variable will remain uniform if all users
4856   // of the induction variable and induction variable update remain uniform.
4857   // The code below handles both pointer and non-pointer induction variables.
4858   for (auto &Induction : Legal->getInductionVars()) {
4859     auto *Ind = Induction.first;
4860     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4861 
4862     // Determine if all users of the induction variable are uniform after
4863     // vectorization.
4864     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4865       auto *I = cast<Instruction>(U);
4866       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4867              isVectorizedMemAccessUse(I, Ind);
4868     });
4869     if (!UniformInd)
4870       continue;
4871 
4872     // Determine if all users of the induction variable update instruction are
4873     // uniform after vectorization.
4874     auto UniformIndUpdate =
4875         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4876           auto *I = cast<Instruction>(U);
4877           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4878                  isVectorizedMemAccessUse(I, IndUpdate);
4879         });
4880     if (!UniformIndUpdate)
4881       continue;
4882 
4883     // The induction variable and its update instruction will remain uniform.
4884     addToWorklistIfAllowed(Ind);
4885     addToWorklistIfAllowed(IndUpdate);
4886   }
4887 
4888   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4889 }
4890 
4891 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4892   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4893 
4894   if (Legal->getRuntimePointerChecking()->Need) {
4895     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4896         "runtime pointer checks needed. Enable vectorization of this "
4897         "loop with '#pragma clang loop vectorize(enable)' when "
4898         "compiling with -Os/-Oz",
4899         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4900     return true;
4901   }
4902 
4903   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4904     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4905         "runtime SCEV checks needed. Enable vectorization of this "
4906         "loop with '#pragma clang loop vectorize(enable)' when "
4907         "compiling with -Os/-Oz",
4908         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4909     return true;
4910   }
4911 
4912   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4913   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4914     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4915         "runtime stride == 1 checks needed. Enable vectorization of "
4916         "this loop with '#pragma clang loop vectorize(enable)' when "
4917         "compiling with -Os/-Oz",
4918         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4919     return true;
4920   }
4921 
4922   return false;
4923 }
4924 
4925 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4926   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
4929     reportVectorizationFailure(
4930         "Not inserting runtime ptr check for divergent target",
4931         "runtime pointer checks needed. Not enabled for divergent target",
4932         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4933     return None;
4934   }
4935 
4936   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4937   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4938   if (TC == 1) {
4939     reportVectorizationFailure("Single iteration (non) loop",
4940         "loop trip count is one, irrelevant for vectorization",
4941         "SingleIterationLoop", ORE, TheLoop);
4942     return None;
4943   }
4944 
4945   switch (ScalarEpilogueStatus) {
4946   case CM_ScalarEpilogueAllowed:
4947     return computeFeasibleMaxVF(TC);
4948   case CM_ScalarEpilogueNotNeededUsePredicate:
4949     LLVM_DEBUG(
4950         dbgs() << "LV: vector predicate hint/switch found.\n"
4951                << "LV: Not allowing scalar epilogue, creating predicated "
4952                << "vector loop.\n");
4953     break;
4954   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4955     // fallthrough as a special case of OptForSize
4956   case CM_ScalarEpilogueNotAllowedOptSize:
4957     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4958       LLVM_DEBUG(
4959           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4960     else
4961       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4962                         << "count.\n");
4963 
    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
4966     if (runtimeChecksRequired())
4967       return None;
4968     break;
4969   }
4970 
  // Now try tail folding.
4972 
4973   // Invalidate interleave groups that require an epilogue if we can't mask
4974   // the interleave-group.
4975   if (!useMaskedInterleavedAccesses(TTI))
4976     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4977 
4978   unsigned MaxVF = computeFeasibleMaxVF(TC);
4979   if (TC > 0 && TC % MaxVF == 0) {
4980     // Accept MaxVF if we do not have a tail.
4981     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4982     return MaxVF;
4983   }
4984 
4985   // If we don't know the precise trip count, or if the trip count that we
4986   // found modulo the vectorization factor is not zero, try to fold the tail
4987   // by masking.
4988   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4989   if (Legal->prepareToFoldTailByMasking()) {
4990     FoldTailByMasking = true;
4991     return MaxVF;
4992   }
4993 
4994   if (TC == 0) {
4995     reportVectorizationFailure(
4996         "Unable to calculate the loop count due to complex control flow",
4997         "unable to calculate the loop count due to complex control flow",
4998         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4999     return None;
5000   }
5001 
5002   reportVectorizationFailure(
5003       "Cannot optimize for size and vectorize at the same time.",
5004       "cannot optimize for size and vectorize at the same time. "
5005       "Enable vectorization of this loop with '#pragma clang loop "
5006       "vectorize(enable)' when compiling with -Os/-Oz",
5007       "NoTailLoopWithOptForSize", ORE, TheLoop);
5008   return None;
5009 }
5010 
5011 unsigned
5012 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5013   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5014   unsigned SmallestType, WidestType;
5015   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5016   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5017 
5018   // Get the maximum safe dependence distance in bits computed by LAA.
  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from the
  // memory access that is most restrictive (involved in the smallest
  // dependence distance).
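  // For example (illustrative numbers): if the most restrictive dependence
  // permits a distance of 8 i32 elements, then MaxVF = 8 and
  // MaxSafeRegisterWidth = 8 * 4 * 8 = 256 bits.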
5022   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5023 
5024   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5025 
5026   unsigned MaxVectorSize = WidestRegister / WidestType;
5027 
5028   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5029                     << " / " << WidestType << " bits.\n");
5030   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5031                     << WidestRegister << " bits.\n");
5032 
5033   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5034                                  " into one vector!");
5035   if (MaxVectorSize == 0) {
5036     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5037     MaxVectorSize = 1;
5038     return MaxVectorSize;
5039   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5040              isPowerOf2_32(ConstTripCount)) {
5041     // We need to clamp the VF to be the ConstTripCount. There is no point in
5042     // choosing a higher viable VF as done in the loop below.
5043     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5044                       << ConstTripCount << "\n");
5045     MaxVectorSize = ConstTripCount;
5046     return MaxVectorSize;
5047   }
5048 
5049   unsigned MaxVF = MaxVectorSize;
5050   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5051       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5052     // Collect all viable vectorization factors larger than the default MaxVF
5053     // (i.e. MaxVectorSize).
5054     SmallVector<unsigned, 8> VFs;
5055     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5056     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5057       VFs.push_back(VS);
5058 
5059     // For each VF calculate its register usage.
5060     auto RUs = calculateRegisterUsage(VFs);
5061 
5062     // Select the largest VF which doesn't require more registers than existing
5063     // ones.
5064     for (int i = RUs.size() - 1; i >= 0; --i) {
5065       bool Selected = true;
5066       for (auto& pair : RUs[i].MaxLocalUsers) {
5067         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5068         if (pair.second > TargetNumRegisters)
5069           Selected = false;
5070       }
5071       if (Selected) {
5072         MaxVF = VFs[i];
5073         break;
5074       }
5075     }
5076     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5077       if (MaxVF < MinVF) {
5078         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5079                           << ") with target's minimum: " << MinVF << '\n');
5080         MaxVF = MinVF;
5081       }
5082     }
5083   }
5084   return MaxVF;
5085 }
5086 
5087 VectorizationFactor
5088 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5089   float Cost = expectedCost(1).first;
5090   const float ScalarCost = Cost;
5091   unsigned Width = 1;
5092   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5093 
5094   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5095   if (ForceVectorization && MaxVF > 1) {
5096     // Ignore scalar width, because the user explicitly wants vectorization.
5097     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5098     // evaluation.
5099     Cost = std::numeric_limits<float>::max();
5100   }
5101 
5102   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
5106     VectorizationCostTy C = expectedCost(i);
5107     float VectorCost = C.first / (float)i;
5108     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5109                       << " costs: " << (int)VectorCost << ".\n");
5110     if (!C.second && !ForceVectorization) {
5111       LLVM_DEBUG(
5112           dbgs() << "LV: Not considering vector loop of width " << i
5113                  << " because it will not generate any vector instructions.\n");
5114       continue;
5115     }
5116     if (VectorCost < Cost) {
5117       Cost = VectorCost;
5118       Width = i;
5119     }
5120   }
5121 
5122   if (!EnableCondStoresVectorization && NumPredStores) {
5123     reportVectorizationFailure("There are conditional stores.",
5124         "store that is conditionally executed prevents vectorization",
5125         "ConditionalStore", ORE, TheLoop);
5126     Width = 1;
5127     Cost = ScalarCost;
5128   }
5129 
5130   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5131              << "LV: Vectorization seems to be not beneficial, "
5132              << "but was forced by a user.\n");
5133   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5134   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5135   return Factor;
5136 }
5137 
5138 std::pair<unsigned, unsigned>
5139 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5140   unsigned MinWidth = -1U;
5141   unsigned MaxWidth = 8;
5142   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5143 
5144   // For each block.
5145   for (BasicBlock *BB : TheLoop->blocks()) {
5146     // For each instruction in the loop.
5147     for (Instruction &I : BB->instructionsWithoutDebug()) {
5148       Type *T = I.getType();
5149 
5150       // Skip ignored values.
5151       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5152         continue;
5153 
5154       // Only examine Loads, Stores and PHINodes.
5155       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5156         continue;
5157 
5158       // Examine PHI nodes that are reduction variables. Update the type to
5159       // account for the recurrence type.
5160       if (auto *PN = dyn_cast<PHINode>(&I)) {
5161         if (!Legal->isReductionVariable(PN))
5162           continue;
5163         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5164         T = RdxDesc.getRecurrenceType();
5165       }
5166 
5167       // Examine the stored values.
5168       if (auto *ST = dyn_cast<StoreInst>(&I))
5169         T = ST->getValueOperand()->getType();
5170 
5171       // Ignore loaded pointer types and stored pointer types that are not
5172       // vectorizable.
5173       //
5174       // FIXME: The check here attempts to predict whether a load or store will
5175       //        be vectorized. We only know this for certain after a VF has
5176       //        been selected. Here, we assume that if an access can be
5177       //        vectorized, it will be. We should also look at extending this
5178       //        optimization to non-pointer types.
5179       //
5180       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5181           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5182         continue;
5183 
5184       MinWidth = std::min(MinWidth,
5185                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5186       MaxWidth = std::max(MaxWidth,
5187                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5188     }
5189   }
5190 
5191   return {MinWidth, MaxWidth};
5192 }
5193 
5194 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5195                                                            unsigned LoopCost) {
5196   // -- The interleave heuristics --
5197   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5198   // There are many micro-architectural considerations that we can't predict
5199   // at this level. For example, frontend pressure (on decode or fetch) due to
5200   // code size, or the number and capabilities of the execution ports.
5201   //
5202   // We use the following heuristics to select the interleave count:
5203   // 1. If the code has reductions, then we interleave to break the cross
5204   // iteration dependency.
5205   // 2. If the loop is really small, then we interleave to reduce the loop
5206   // overhead.
5207   // 3. We don't interleave if we think that we will spill registers to memory
5208   // due to the increased register pressure.
5209 
5210   if (!isScalarEpilogueAllowed())
5211     return 1;
5212 
  // If a max safe dependence distance limits the loop, it was already used to
  // bound the VF; be conservative and do not interleave.
5214   if (Legal->getMaxSafeDepDistBytes() != -1U)
5215     return 1;
5216 
5217   // Do not interleave loops with a relatively small known or estimated trip
5218   // count.
5219   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5220   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5221     return 1;
5222 
5223   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so make sure they are at least one, i.e.
  // assume that at least one instruction uses at least one register.
5226   for (auto& pair : R.MaxLocalUsers) {
5227     pair.second = std::max(pair.second, 1U);
5228   }
5229 
5230   // We calculate the interleave count using the following formula.
5231   // Subtract the number of loop invariants from the number of available
5232   // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that are
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want a power-of-two interleave count to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size (OptForSize), in
  // which case IC is set to 1 above.
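  // For example (illustrative numbers): with 32 registers in a class, 2 of
  // them tied up by loop-invariant values and a peak of 6 live values per
  // instance, the estimate is PowerOf2Floor((32 - 2) / 6) = 4 instances.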
5241   unsigned IC = UINT_MAX;
5242 
5243   for (auto& pair : R.MaxLocalUsers) {
5244     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5245     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5246                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5248     if (VF == 1) {
5249       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5250         TargetNumRegisters = ForceTargetNumScalarRegs;
5251     } else {
5252       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5253         TargetNumRegisters = ForceTargetNumVectorRegs;
5254     }
5255     unsigned MaxLocalUsers = pair.second;
5256     unsigned LoopInvariantRegs = 0;
5257     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5258       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5259 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5261     // Don't count the induction variable as interleaved.
5262     if (EnableIndVarRegisterHeur) {
5263       TmpIC =
5264           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5265                         std::max(1U, (MaxLocalUsers - 1)));
5266     }
5267 
5268     IC = std::min(IC, TmpIC);
5269   }
5270 
5271   // Clamp the interleave ranges to reasonable counts.
5272   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5273 
5274   // Check if the user has overridden the max.
5275   if (VF == 1) {
5276     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5277       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5278   } else {
5279     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5280       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5281   }
5282 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF.
5285   if (BestKnownTC) {
5286     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5287   }
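  // E.g. (illustrative): with an estimated trip count of 64 and VF = 8, the
  // interleave count is capped at 64 / 8 = 8.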
5288 
5289   // If we did not calculate the cost for VF (because the user selected the VF)
5290   // then we calculate the cost of VF here.
5291   if (LoopCost == 0)
5292     LoopCost = expectedCost(VF).first;
5293 
5294   assert(LoopCost && "Non-zero loop cost expected");
5295 
  // Clamp the calculated IC to be between 1 and the max interleave count that
  // the target and trip count allow.
5298   if (IC > MaxInterleaveCount)
5299     IC = MaxInterleaveCount;
5300   else if (IC < 1)
5301     IC = 1;
5302 
5303   // Interleave if we vectorized this loop and there is a reduction that could
5304   // benefit from interleaving.
5305   if (VF > 1 && !Legal->getReductionVars().empty()) {
5306     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5307     return IC;
5308   }
5309 
5310   // Note that if we've already vectorized the loop we will have done the
5311   // runtime check and so interleaving won't require further checks.
5312   bool InterleavingRequiresRuntimePointerCheck =
5313       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5314 
5315   // We want to interleave small loops in order to reduce the loop overhead and
5316   // potentially expose ILP opportunities.
5317   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5318   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5319     // We assume that the cost overhead is 1 and we use the cost model
5320     // to estimate the cost of the loop and interleave until the cost of the
5321     // loop overhead is about 5% of the cost of the loop.
5322     unsigned SmallIC =
5323         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
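    // E.g. (illustrative, assuming SmallLoopCost keeps its default threshold
    // of 20): a loop with an estimated cost of 5 gives
    // SmallIC = min(IC, PowerOf2Floor(20 / 5)) = min(IC, 4).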
5324 
5325     // Interleave until store/load ports (estimated by max interleave count) are
5326     // saturated.
5327     unsigned NumStores = Legal->getNumStores();
5328     unsigned NumLoads = Legal->getNumLoads();
5329     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5330     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5331 
5332     // If we have a scalar reduction (vector reductions are already dealt with
5333     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit it, by default, to 2 so
    // the critical path only gets increased by one reduction operation.
5336     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5337       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5338       SmallIC = std::min(SmallIC, F);
5339       StoresIC = std::min(StoresIC, F);
5340       LoadsIC = std::min(LoadsIC, F);
5341     }
5342 
5343     if (EnableLoadStoreRuntimeInterleave &&
5344         std::max(StoresIC, LoadsIC) > SmallIC) {
5345       LLVM_DEBUG(
5346           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5347       return std::max(StoresIC, LoadsIC);
5348     }
5349 
5350     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5351     return SmallIC;
5352   }
5353 
5354   // Interleave if this is a large loop (small loops are already dealt with by
5355   // this point) that could benefit from interleaving.
5356   bool HasReductions = !Legal->getReductionVars().empty();
5357   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5358     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5359     return IC;
5360   }
5361 
5362   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5363   return 1;
5364 }
5365 
5366 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5367 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5368   // This function calculates the register usage by measuring the highest number
5369   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
5372   // met before their users. We assume that each instruction that has in-loop
5373   // users starts an interval. We record every time that an in-loop value is
5374   // used, so we have a list of the first and last occurrences of each
5375   // instruction. Next, we transpose this data structure into a multi map that
5376   // holds the list of intervals that *end* at a specific location. This multi
5377   // map allows us to perform a linear search. We scan the instructions linearly
5378   // and record each time that a new interval starts, by placing it in a set.
5379   // If we find this value in the multi-map then we remove it from the set.
5380   // The max register usage is the maximum size of the set.
5381   // We also search for instructions that are defined outside the loop, but are
5382   // used inside the loop. We need this number separately from the max-interval
  // usage number because, when we unroll, loop-invariant values do not
  // require additional registers.
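  //
  // A tiny illustrative example of the interval bookkeeping:
  //   %a = ...            ; last used by %c
  //   %b = ...            ; last used by %c
  //   %c = add %a, %b
  // When %c is reached, both %a and %b are still open, so the estimated
  // maximum register usage for this snippet is 2.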
5385   LoopBlocksDFS DFS(TheLoop);
5386   DFS.perform(LI);
5387 
5388   RegisterUsage RU;
5389 
5390   // Each 'key' in the map opens a new interval. The values
5391   // of the map are the index of the 'last seen' usage of the
5392   // instruction that is the key.
5393   using IntervalMap = DenseMap<Instruction *, unsigned>;
5394 
5395   // Maps instruction to its index.
5396   SmallVector<Instruction *, 64> IdxToInstr;
5397   // Marks the end of each interval.
5398   IntervalMap EndPoint;
  // Saves the set of instructions that are used inside the loop.
5400   SmallPtrSet<Instruction *, 8> Ends;
5401   // Saves the list of values that are used in the loop but are
5402   // defined outside the loop, such as arguments and constants.
5403   SmallPtrSet<Value *, 8> LoopInvariants;
5404 
5405   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5406     for (Instruction &I : BB->instructionsWithoutDebug()) {
5407       IdxToInstr.push_back(&I);
5408 
5409       // Save the end location of each USE.
5410       for (Value *U : I.operands()) {
5411         auto *Instr = dyn_cast<Instruction>(U);
5412 
5413         // Ignore non-instruction values such as arguments, constants, etc.
5414         if (!Instr)
5415           continue;
5416 
5417         // If this instruction is outside the loop then record it and continue.
5418         if (!TheLoop->contains(Instr)) {
5419           LoopInvariants.insert(Instr);
5420           continue;
5421         }
5422 
5423         // Overwrite previous end points.
5424         EndPoint[Instr] = IdxToInstr.size();
5425         Ends.insert(Instr);
5426       }
5427     }
5428   }
5429 
5430   // Saves the list of intervals that end with the index in 'key'.
5431   using InstrList = SmallVector<Instruction *, 2>;
5432   DenseMap<unsigned, InstrList> TransposeEnds;
5433 
5434   // Transpose the EndPoints to a list of values that end at each index.
5435   for (auto &Interval : EndPoint)
5436     TransposeEnds[Interval.second].push_back(Interval.first);
5437 
5438   SmallPtrSet<Instruction *, 8> OpenIntervals;
5439 
5440   // Get the size of the widest register.
5441   unsigned MaxSafeDepDist = -1U;
5442   if (Legal->getMaxSafeDepDistBytes() != -1U)
5443     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5444   unsigned WidestRegister =
5445       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5446   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5447 
5448   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5449   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5450 
5451   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5452 
5453   // A lambda that gets the register usage for the given type and VF.
5454   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5455     if (Ty->isTokenTy())
5456       return 0U;
5457     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5458     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5459   };
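  // E.g. (illustrative): for VF = 8, a 32-bit element type and a 128-bit
  // widest register, GetRegUsage returns std::max(1u, 8 * 32 / 128) = 2
  // registers.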
5460 
5461   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5462     Instruction *I = IdxToInstr[i];
5463 
5464     // Remove all of the instructions that end at this location.
5465     InstrList &List = TransposeEnds[i];
5466     for (Instruction *ToRemove : List)
5467       OpenIntervals.erase(ToRemove);
5468 
5469     // Ignore instructions that are never used within the loop.
5470     if (Ends.find(I) == Ends.end())
5471       continue;
5472 
5473     // Skip ignored values.
5474     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5475       continue;
5476 
5477     // For each VF find the maximum usage of registers.
5478     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5479       // Count the number of live intervals.
5480       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5481 
5482       if (VFs[j] == 1) {
5483         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
5489         }
5490       } else {
5491         collectUniformsAndScalars(VFs[j]);
5492         for (auto Inst : OpenIntervals) {
5493           // Skip ignored values for VF > 1.
5494           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5495             continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
5509         }
5510       }
5511 
      for (auto &pair : RegUsage) {
        auto &Entry = MaxUsages[j][pair.first];
        Entry = std::max(Entry, pair.second);
      }
5518     }
5519 
5520     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5521                       << OpenIntervals.size() << '\n');
5522 
5523     // Add the current instruction to the list of open intervals.
5524     OpenIntervals.insert(I);
5525   }
5526 
5527   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5528     SmallMapVector<unsigned, unsigned, 4> Invariant;
5529 
5530     for (auto Inst : LoopInvariants) {
5531       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
      Invariant[ClassID] += Usage;
5537     }
5538 
5539     LLVM_DEBUG({
5540       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5541       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5542              << " item\n";
5543       for (const auto &pair : MaxUsages[i]) {
5544         dbgs() << "LV(REG): RegisterClass: "
5545                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5546                << " registers\n";
5547       }
5548       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5549              << " item\n";
5550       for (const auto &pair : Invariant) {
5551         dbgs() << "LV(REG): RegisterClass: "
5552                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5553                << " registers\n";
5554       }
5555     });
5556 
5557     RU.LoopInvariantRegs = Invariant;
5558     RU.MaxLocalUsers = MaxUsages[i];
5559     RUs[i] = RU;
5560   }
5561 
5562   return RUs;
5563 }
5564 
5565 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
  // TODO: The cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked load/gather emulation was previously never allowed.
  // Only a limited number of masked store/scatter emulations were allowed.
5574   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5575   return isa<LoadInst>(I) ||
5576          (isa<StoreInst>(I) &&
5577           NumPredStores > NumberOfStoresToPredicate);
5578 }
5579 
5580 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5581   // If we aren't vectorizing the loop, or if we've already collected the
5582   // instructions to scalarize, there's nothing to do. Collection may already
5583   // have occurred if we have a user-selected VF and are now computing the
5584   // expected cost for interleaving.
5585   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5586     return;
5587 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5589   // not profitable to scalarize any instructions, the presence of VF in the
5590   // map will indicate that we've analyzed it already.
5591   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5592 
5593   // Find all the instructions that are scalar with predication in the loop and
5594   // determine if it would be better to not if-convert the blocks they are in.
5595   // If so, we also record the instructions to scalarize.
5596   for (BasicBlock *BB : TheLoop->blocks()) {
5597     if (!blockNeedsPredication(BB))
5598       continue;
5599     for (Instruction &I : *BB)
5600       if (isScalarWithPredication(&I)) {
5601         ScalarCostsTy ScalarCosts;
5602         // Do not apply discount logic if hacked cost is needed
5603         // for emulated masked memrefs.
5604         if (!useEmulatedMaskMemRefHack(&I) &&
5605             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5606           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5607         // Remember that BB will remain after vectorization.
5608         PredicatedBBsAfterVectorization.insert(BB);
5609       }
5610   }
5611 }
5612 
5613 int LoopVectorizationCostModel::computePredInstDiscount(
5614     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5615     unsigned VF) {
5616   assert(!isUniformAfterVectorization(PredInst, VF) &&
5617          "Instruction marked uniform-after-vectorization will be predicated");
5618 
5619   // Initialize the discount to zero, meaning that the scalar version and the
5620   // vector version cost the same.
5621   int Discount = 0;
5622 
5623   // Holds instructions to analyze. The instructions we visit are mapped in
5624   // ScalarCosts. Those instructions are the ones that would be scalarized if
5625   // we find that the scalar version costs less.
5626   SmallVector<Instruction *, 8> Worklist;
5627 
5628   // Returns true if the given instruction can be scalarized.
5629   auto canBeScalarized = [&](Instruction *I) -> bool {
5630     // We only attempt to scalarize instructions forming a single-use chain
5631     // from the original predicated block that would otherwise be vectorized.
5632     // Although not strictly necessary, we give up on instructions we know will
5633     // already be scalar to avoid traversing chains that are unlikely to be
5634     // beneficial.
5635     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5636         isScalarAfterVectorization(I, VF))
5637       return false;
5638 
5639     // If the instruction is scalar with predication, it will be analyzed
5640     // separately. We ignore it within the context of PredInst.
5641     if (isScalarWithPredication(I))
5642       return false;
5643 
5644     // If any of the instruction's operands are uniform after vectorization,
5645     // the instruction cannot be scalarized. This prevents, for example, a
5646     // masked load from being scalarized.
5647     //
5648     // We assume we will only emit a value for lane zero of an instruction
5649     // marked uniform after vectorization, rather than VF identical values.
5650     // Thus, if we scalarize an instruction that uses a uniform, we would
5651     // create uses of values corresponding to the lanes we aren't emitting code
5652     // for. This behavior can be changed by allowing getScalarValue to clone
5653     // the lane zero values for uniforms rather than asserting.
5654     for (Use &U : I->operands())
5655       if (auto *J = dyn_cast<Instruction>(U.get()))
5656         if (isUniformAfterVectorization(J, VF))
5657           return false;
5658 
5659     // Otherwise, we can scalarize the instruction.
5660     return true;
5661   };
5662 
5663   // Compute the expected cost discount from scalarizing the entire expression
5664   // feeding the predicated instruction. We currently only consider expressions
5665   // that are single-use instruction chains.
5666   Worklist.push_back(PredInst);
5667   while (!Worklist.empty()) {
5668     Instruction *I = Worklist.pop_back_val();
5669 
5670     // If we've already analyzed the instruction, there's nothing to do.
5671     if (ScalarCosts.find(I) != ScalarCosts.end())
5672       continue;
5673 
5674     // Compute the cost of the vector instruction. Note that this cost already
5675     // includes the scalarization overhead of the predicated instruction.
5676     unsigned VectorCost = getInstructionCost(I, VF).first;
5677 
5678     // Compute the cost of the scalarized instruction. This cost is the cost of
5679     // the instruction as if it wasn't if-converted and instead remained in the
5680     // predicated block. We will scale this cost by block probability after
5681     // computing the scalarization overhead.
5682     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5683 
5684     // Compute the scalarization overhead of needed insertelement instructions
5685     // and phi nodes.
5686     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5687       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5688                                                  true, false);
5689       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5690     }
5691 
5692     // Compute the scalarization overhead of needed extractelement
5693     // instructions. For each of the instruction's operands, if the operand can
5694     // be scalarized, add it to the worklist; otherwise, account for the
5695     // overhead.
5696     for (Use &U : I->operands())
5697       if (auto *J = dyn_cast<Instruction>(U.get())) {
5698         assert(VectorType::isValidElementType(J->getType()) &&
5699                "Instruction has non-scalar type");
5700         if (canBeScalarized(J))
5701           Worklist.push_back(J);
5702         else if (needsExtract(J, VF))
5703           ScalarCost += TTI.getScalarizationOverhead(
5704                               ToVectorTy(J->getType(),VF), false, true);
5705       }
5706 
5707     // Scale the total scalar cost by block probability.
5708     ScalarCost /= getReciprocalPredBlockProb();
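    // For example (illustrative, assuming the reciprocal block probability is
    // its usual value of 2): the predicated block is expected to execute on
    // roughly half of the iterations, so its scalar cost is halved.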
5709 
5710     // Compute the discount. A non-negative discount means the vector version
5711     // of the instruction costs more, and scalarizing would be beneficial.
5712     Discount += VectorCost - ScalarCost;
5713     ScalarCosts[I] = ScalarCost;
5714   }
5715 
5716   return Discount;
5717 }
5718 
5719 LoopVectorizationCostModel::VectorizationCostTy
5720 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5721   VectorizationCostTy Cost;
5722 
5723   // For each block.
5724   for (BasicBlock *BB : TheLoop->blocks()) {
5725     VectorizationCostTy BlockCost;
5726 
5727     // For each instruction in the old loop.
5728     for (Instruction &I : BB->instructionsWithoutDebug()) {
5729       // Skip ignored values.
5730       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5731           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5732         continue;
5733 
5734       VectorizationCostTy C = getInstructionCost(&I, VF);
5735 
5736       // Check if we should override the cost.
5737       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5738         C.first = ForceTargetInstructionCost;
5739 
5740       BlockCost.first += C.first;
5741       BlockCost.second |= C.second;
5742       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5743                         << " for VF " << VF << " For instruction: " << I
5744                         << '\n');
5745     }
5746 
5747     // If we are vectorizing a predicated block, it will have been
5748     // if-converted. This means that the block's instructions (aside from
5749     // stores and instructions that may divide by zero) will now be
5750     // unconditionally executed. For the scalar case, we may not always execute
5751     // the predicated block. Thus, scale the block's cost by the probability of
5752     // executing it.
5753     if (VF == 1 && blockNeedsPredication(BB))
5754       BlockCost.first /= getReciprocalPredBlockProb();
5755 
5756     Cost.first += BlockCost.first;
5757     Cost.second |= BlockCost.second;
5758   }
5759 
5760   return Cost;
5761 }
5762 
/// Gets the address access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
5765 ///
5766 /// This SCEV can be sent to the Target in order to estimate the address
5767 /// calculation cost.
5768 static const SCEV *getAddressAccessSCEV(
5769               Value *Ptr,
5770               LoopVectorizationLegality *Legal,
5771               PredicatedScalarEvolution &PSE,
5772               const Loop *TheLoop) {
5773 
5774   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5775   if (!Gep)
5776     return nullptr;
5777 
5778   // We are looking for a gep with all loop invariant indices except for one
5779   // which should be an induction variable.
5780   auto SE = PSE.getSE();
5781   unsigned NumOperands = Gep->getNumOperands();
5782   for (unsigned i = 1; i < NumOperands; ++i) {
5783     Value *Opd = Gep->getOperand(i);
5784     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5785         !Legal->isInductionVariable(Opd))
5786       return nullptr;
5787   }
5788 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5790   return PSE.getSCEV(Ptr);
5791 }
5792 
5793 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5794   return Legal->hasStride(I->getOperand(0)) ||
5795          Legal->hasStride(I->getOperand(1));
5796 }
5797 
5798 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5799                                                                  unsigned VF) {
5800   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5801   Type *ValTy = getMemInstValueType(I);
5802   auto SE = PSE.getSE();
5803 
5804   unsigned AS = getLoadStoreAddressSpace(I);
5805   Value *Ptr = getLoadStorePointerOperand(I);
5806   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5807 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5810   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5811 
5812   // Get the cost of the scalar memory instruction and address computation.
5813   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5814 
5815   // Don't pass *I here, since it is scalar but will actually be part of a
5816   // vectorized loop where the user of it is a vectorized instruction.
5817   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5818   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5819                                    Alignment, AS);
5820 
5821   // Get the overhead of the extractelement and insertelement instructions
5822   // we might create due to scalarization.
5823   Cost += getScalarizationOverhead(I, VF);
5824 
5825   // If we have a predicated store, it may not be executed for each vector
5826   // lane. Scale the cost by the probability of executing the predicated
5827   // block.
5828   if (isPredicatedInst(I)) {
5829     Cost /= getReciprocalPredBlockProb();
5830 
5831     if (useEmulatedMaskMemRefHack(I))
5832       // Artificially setting to a high enough value to practically disable
5833       // vectorization with such operations.
5834       Cost = 3000000;
5835   }
5836 
5837   return Cost;
5838 }
5839 
5840 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5841                                                              unsigned VF) {
5842   Type *ValTy = getMemInstValueType(I);
5843   Type *VectorTy = ToVectorTy(ValTy, VF);
5844   Value *Ptr = getLoadStorePointerOperand(I);
5845   unsigned AS = getLoadStoreAddressSpace(I);
5846   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5847 
5848   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5849          "Stride should be 1 or -1 for consecutive memory access");
5850   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5851   unsigned Cost = 0;
5852   if (Legal->isMaskRequired(I))
5853     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5854                                       Alignment ? Alignment->value() : 0, AS);
5855   else
5856     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5857 
5858   bool Reverse = ConsecutiveStride < 0;
5859   if (Reverse)
5860     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5861   return Cost;
5862 }
5863 
5864 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5865                                                          unsigned VF) {
5866   Type *ValTy = getMemInstValueType(I);
5867   Type *VectorTy = ToVectorTy(ValTy, VF);
5868   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5869   unsigned AS = getLoadStoreAddressSpace(I);
5870   if (isa<LoadInst>(I)) {
5871     return TTI.getAddressComputationCost(ValTy) +
5872            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5873            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5874   }
5875   StoreInst *SI = cast<StoreInst>(I);
5876 
5877   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5878   return TTI.getAddressComputationCost(ValTy) +
5879          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5880          (isLoopInvariantStoreValue
5881               ? 0
5882               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5883                                        VF - 1));
5884 }
5885 
5886 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5887                                                           unsigned VF) {
5888   Type *ValTy = getMemInstValueType(I);
5889   Type *VectorTy = ToVectorTy(ValTy, VF);
5890   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5891   Value *Ptr = getLoadStorePointerOperand(I);
5892 
5893   return TTI.getAddressComputationCost(VectorTy) +
5894          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5895                                     Legal->isMaskRequired(I),
5896                                     Alignment ? Alignment->value() : 0, I);
5897 }
5898 
5899 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5900                                                             unsigned VF) {
5901   Type *ValTy = getMemInstValueType(I);
5902   Type *VectorTy = ToVectorTy(ValTy, VF);
5903   unsigned AS = getLoadStoreAddressSpace(I);
5904 
5905   auto Group = getInterleavedAccessGroup(I);
5906   assert(Group && "Fail to get an interleaved access group.");
5907 
5908   unsigned InterleaveFactor = Group->getFactor();
5909   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5910 
5911   // Holds the indices of existing members in an interleaved load group.
5912   // An interleaved store group doesn't need this as it doesn't allow gaps.
5913   SmallVector<unsigned, 4> Indices;
5914   if (isa<LoadInst>(I)) {
5915     for (unsigned i = 0; i < InterleaveFactor; i++)
5916       if (Group->getMember(i))
5917         Indices.push_back(i);
5918   }
5919 
5920   // Calculate the cost of the whole interleaved group.
5921   bool UseMaskForGaps =
5922       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5923   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5924       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5925       Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5926 
5927   if (Group->isReverse()) {
5928     // TODO: Add support for reversed masked interleaved access.
5929     assert(!Legal->isMaskRequired(I) &&
5930            "Reverse masked interleaved access not supported.");
5931     Cost += Group->getNumMembers() *
5932             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5933   }
5934   return Cost;
5935 }
5936 
5937 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5938                                                               unsigned VF) {
5939   // Calculate scalar cost only. Vectorization cost should be ready at this
5940   // moment.
5941   if (VF == 1) {
5942     Type *ValTy = getMemInstValueType(I);
5943     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5944     unsigned AS = getLoadStoreAddressSpace(I);
5945 
5946     return TTI.getAddressComputationCost(ValTy) +
5947            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5948   }
5949   return getWideningCost(I, VF);
5950 }
5951 
5952 LoopVectorizationCostModel::VectorizationCostTy
5953 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5954   // If we know that this instruction will remain uniform, check the cost of
5955   // the scalar version.
5956   if (isUniformAfterVectorization(I, VF))
5957     VF = 1;
5958 
5959   if (VF > 1 && isProfitableToScalarize(I, VF))
5960     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5961 
5962   // Forced scalars do not have any scalarization overhead.
5963   auto ForcedScalar = ForcedScalars.find(VF);
5964   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5965     auto InstSet = ForcedScalar->second;
5966     if (InstSet.find(I) != InstSet.end())
5967       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5968   }
5969 
5970   Type *VectorTy;
5971   unsigned C = getInstructionCost(I, VF, VectorTy);
5972 
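  // The type counts as "not scalarized" when the target can hold the widened
  // value in fewer than VF registers (parts); otherwise the vector type
  // effectively decomposes into VF scalar pieces.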
5973   bool TypeNotScalarized =
5974       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5975   return VectorizationCostTy(C, TypeNotScalarized);
5976 }
5977 
5978 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5979                                                               unsigned VF) {
5980 
5981   if (VF == 1)
5982     return 0;
5983 
5984   unsigned Cost = 0;
5985   Type *RetTy = ToVectorTy(I->getType(), VF);
5986   if (!RetTy->isVoidTy() &&
5987       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5988     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5989 
5990   // Some targets keep addresses scalar.
5991   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5992     return Cost;
5993 
5994   // Some targets support efficient element stores.
5995   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5996     return Cost;
5997 
5998   // Collect operands to consider.
5999   CallInst *CI = dyn_cast<CallInst>(I);
6000   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6001 
6002   // Skip operands that do not require extraction/scalarization and do not incur
6003   // any overhead.
6004   return Cost + TTI.getOperandsScalarizationOverhead(
6005                     filterExtractingOperands(Ops, VF), VF);
6006 }
6007 
6008 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6009   if (VF == 1)
6010     return;
6011   NumPredStores = 0;
6012   for (BasicBlock *BB : TheLoop->blocks()) {
6013     // For each instruction in the old loop.
6014     for (Instruction &I : *BB) {
6015       Value *Ptr =  getLoadStorePointerOperand(&I);
6016       if (!Ptr)
6017         continue;
6018 
6019       // TODO: We should generate better code and update the cost model for
6020       // predicated uniform stores. Today they are treated as any other
6021       // predicated store (see added test cases in
6022       // invariant-store-vectorization.ll).
6023       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6024         NumPredStores++;
6025 
6026       if (Legal->isUniform(Ptr) &&
6027           // Conditional loads and stores should be scalarized and predicated.
6028           // isScalarWithPredication cannot be used here since masked
6029           // gather/scatters are not considered scalar with predication.
6030           !Legal->blockNeedsPredication(I.getParent())) {
6031         // TODO: Avoid replicating loads and stores instead of
6032         // relying on instcombine to remove them.
6033         // Load: Scalar load + broadcast
6034         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6035         unsigned Cost = getUniformMemOpCost(&I, VF);
6036         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6037         continue;
6038       }
6039 
6040       // We assume that widening is the best solution when possible.
6041       if (memoryInstructionCanBeWidened(&I, VF)) {
6042         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6043         int ConsecutiveStride =
6044                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6045         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6046                "Expected consecutive stride.");
6047         InstWidening Decision =
6048             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6049         setWideningDecision(&I, VF, Decision, Cost);
6050         continue;
6051       }
6052 
6053       // Choose between Interleaving, Gather/Scatter or Scalarization.
6054       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6055       unsigned NumAccesses = 1;
6056       if (isAccessInterleaved(&I)) {
6057         auto Group = getInterleavedAccessGroup(&I);
6058         assert(Group && "Fail to get an interleaved access group.");
6059 
6060         // Make one decision for the whole group.
6061         if (getWideningDecision(&I, VF) != CM_Unknown)
6062           continue;
6063 
6064         NumAccesses = Group->getNumMembers();
6065         if (interleavedAccessCanBeWidened(&I, VF))
6066           InterleaveCost = getInterleaveGroupCost(&I, VF);
6067       }
6068 
6069       unsigned GatherScatterCost =
6070           isLegalGatherOrScatter(&I)
6071               ? getGatherScatterCost(&I, VF) * NumAccesses
6072               : std::numeric_limits<unsigned>::max();
6073 
6074       unsigned ScalarizationCost =
6075           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6076 
6077       // Choose better solution for the current VF,
6078       // write down this decision and use it during vectorization.
6079       unsigned Cost;
6080       InstWidening Decision;
6081       if (InterleaveCost <= GatherScatterCost &&
6082           InterleaveCost < ScalarizationCost) {
6083         Decision = CM_Interleave;
6084         Cost = InterleaveCost;
6085       } else if (GatherScatterCost < ScalarizationCost) {
6086         Decision = CM_GatherScatter;
6087         Cost = GatherScatterCost;
6088       } else {
6089         Decision = CM_Scalarize;
6090         Cost = ScalarizationCost;
6091       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group also receives the cost,
      // but the cost will actually be assigned to one instruction.
6095       if (auto Group = getInterleavedAccessGroup(&I))
6096         setWideningDecision(Group, VF, Decision, Cost);
6097       else
6098         setWideningDecision(&I, VF, Decision, Cost);
6099     }
6100   }
6101 
  // Make sure that any load of an address and any other address computation
  // remain scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
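  // E.g., a load whose result is used (directly or via further arithmetic)
  // to compute the address of another memory access is forced to remain
  // scalar below.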
6107   if (TTI.prefersVectorizedAddressing())
6108     return;
6109 
6110   // Start with all scalar pointer uses.
6111   SmallPtrSet<Instruction *, 8> AddrDefs;
6112   for (BasicBlock *BB : TheLoop->blocks())
6113     for (Instruction &I : *BB) {
6114       Instruction *PtrDef =
6115         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6116       if (PtrDef && TheLoop->contains(PtrDef) &&
6117           getWideningDecision(&I, VF) != CM_GatherScatter)
6118         AddrDefs.insert(PtrDef);
6119     }
6120 
6121   // Add all instructions used to generate the addresses.
6122   SmallVector<Instruction *, 4> Worklist;
6123   for (auto *I : AddrDefs)
6124     Worklist.push_back(I);
6125   while (!Worklist.empty()) {
6126     Instruction *I = Worklist.pop_back_val();
6127     for (auto &Op : I->operands())
6128       if (auto *InstOp = dyn_cast<Instruction>(Op))
6129         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6130             AddrDefs.insert(InstOp).second)
6131           Worklist.push_back(InstOp);
6132   }
6133 
6134   for (auto *I : AddrDefs) {
6135     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded register is involved in an address computation, the decision
      // is instead changed here once we know this is the case.
6140       InstWidening Decision = getWideningDecision(I, VF);
6141       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6142         // Scalarize a widened load of address.
6143         setWideningDecision(I, VF, CM_Scalarize,
6144                             (VF * getMemoryInstructionCost(I, 1)));
6145       else if (auto Group = getInterleavedAccessGroup(I)) {
6146         // Scalarize an interleave group of address loads.
6147         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6148           if (Instruction *Member = Group->getMember(I))
6149             setWideningDecision(Member, VF, CM_Scalarize,
6150                                 (VF * getMemoryInstructionCost(Member, 1)));
6151         }
6152       }
6153     } else
6154       // Make sure I gets scalarized and a cost estimate without
6155       // scalarization overhead.
6156       ForcedScalars[VF].insert(I);
6157   }
6158 }
6159 
6160 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6161                                                         unsigned VF,
6162                                                         Type *&VectorTy) {
6163   Type *RetTy = I->getType();
6164   if (canTruncateToMinimalBitwidth(I, VF))
6165     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6166   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6167   auto SE = PSE.getSE();
6168 
6169   // TODO: We need to estimate the cost of intrinsic calls.
6170   switch (I->getOpcode()) {
6171   case Instruction::GetElementPtr:
6172     // We mark this instruction as zero-cost because the cost of GEPs in
6173     // vectorized code depends on whether the corresponding memory instruction
6174     // is scalarized or not. Therefore, we handle GEPs with the memory
6175     // instruction cost.
6176     return 0;
6177   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6181     bool ScalarPredicatedBB = false;
6182     BranchInst *BI = cast<BranchInst>(I);
6183     if (VF > 1 && BI->isConditional() &&
6184         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6185              PredicatedBBsAfterVectorization.end() ||
6186          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6187              PredicatedBBsAfterVectorization.end()))
6188       ScalarPredicatedBB = true;
6189 
6190     if (ScalarPredicatedBB) {
6191       // Return cost for branches around scalarized and predicated blocks.
6192       Type *Vec_i1Ty =
6193           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6194       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6195               (TTI.getCFInstrCost(Instruction::Br) * VF));
6196     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6197       // The back-edge branch will remain, as will all scalar branches.
6198       return TTI.getCFInstrCost(Instruction::Br);
6199     else
6200       // This branch will be eliminated by if-conversion.
6201       return 0;
6202     // Note: We currently assume zero cost for an unconditional branch inside
6203     // a predicated block since it will become a fall-through, although we
6204     // may decide in the future to call TTI for all branches.
6205   }
6206   case Instruction::PHI: {
6207     auto *Phi = cast<PHINode>(I);
6208 
6209     // First-order recurrences are replaced by vector shuffles inside the loop.
6210     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6211     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6212       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6213                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6214 
6215     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6216     // converted into select instructions. We require N - 1 selects per phi
6217     // node, where N is the number of incoming values.
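    // E.g., a phi with three incoming values is costed as two nested selects.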
6218     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6219       return (Phi->getNumIncomingValues() - 1) *
6220              TTI.getCmpSelInstrCost(
6221                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6222                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6223 
6224     return TTI.getCFInstrCost(Instruction::PHI);
6225   }
6226   case Instruction::UDiv:
6227   case Instruction::SDiv:
6228   case Instruction::URem:
6229   case Instruction::SRem:
6230     // If we have a predicated instruction, it may not be executed for each
6231     // vector lane. Get the scalarization cost and scale this amount by the
6232     // probability of executing the predicated block. If the instruction is not
6233     // predicated, we fall through to the next case.
6234     if (VF > 1 && isScalarWithPredication(I)) {
6235       unsigned Cost = 0;
6236 
6237       // These instructions have a non-void type, so account for the phi nodes
6238       // that we will create. This cost is likely to be zero. The phi node
6239       // cost, if any, should be scaled by the block probability because it
6240       // models a copy at the end of each predicated block.
6241       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6242 
6243       // The cost of the non-predicated instruction.
6244       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6245 
6246       // The cost of insertelement and extractelement instructions needed for
6247       // scalarization.
6248       Cost += getScalarizationOverhead(I, VF);
6249 
6250       // Scale the cost by the probability of executing the predicated blocks.
6251       // This assumes the predicated block for each vector lane is equally
6252       // likely.
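      // E.g., getReciprocalPredBlockProb() currently returns 2, modelling a
      // 50% chance of executing the predicated block, so the accumulated cost
      // above is halved.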
6253       return Cost / getReciprocalPredBlockProb();
6254     }
6255     LLVM_FALLTHROUGH;
6256   case Instruction::Add:
6257   case Instruction::FAdd:
6258   case Instruction::Sub:
6259   case Instruction::FSub:
6260   case Instruction::Mul:
6261   case Instruction::FMul:
6262   case Instruction::FDiv:
6263   case Instruction::FRem:
6264   case Instruction::Shl:
6265   case Instruction::LShr:
6266   case Instruction::AShr:
6267   case Instruction::And:
6268   case Instruction::Or:
6269   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
6271     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6272       return 0;
6273     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6275     Value *Op2 = I->getOperand(1);
6276     TargetTransformInfo::OperandValueProperties Op2VP;
6277     TargetTransformInfo::OperandValueKind Op2VK =
6278         TTI.getOperandInfo(Op2, Op2VP);
6279     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6280       Op2VK = TargetTransformInfo::OK_UniformValue;
6281 
6282     SmallVector<const Value *, 4> Operands(I->operand_values());
6283     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6284     return N * TTI.getArithmeticInstrCost(
6285                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6286                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6287   }
6288   case Instruction::FNeg: {
6289     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6290     return N * TTI.getArithmeticInstrCost(
6291                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6292                    TargetTransformInfo::OK_AnyValue,
6293                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6294                    I->getOperand(0), I);
6295   }
6296   case Instruction::Select: {
6297     SelectInst *SI = cast<SelectInst>(I);
6298     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6299     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6300     Type *CondTy = SI->getCondition()->getType();
6301     if (!ScalarCond)
6302       CondTy = VectorType::get(CondTy, VF);
6303 
6304     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6305   }
6306   case Instruction::ICmp:
6307   case Instruction::FCmp: {
6308     Type *ValTy = I->getOperand(0)->getType();
6309     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6310     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6311       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6312     VectorTy = ToVectorTy(ValTy, VF);
6313     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6314   }
6315   case Instruction::Store:
6316   case Instruction::Load: {
6317     unsigned Width = VF;
6318     if (Width > 1) {
6319       InstWidening Decision = getWideningDecision(I, Width);
6320       assert(Decision != CM_Unknown &&
6321              "CM decision should be taken at this point");
6322       if (Decision == CM_Scalarize)
6323         Width = 1;
6324     }
6325     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6326     return getMemoryInstructionCost(I, VF);
6327   }
6328   case Instruction::ZExt:
6329   case Instruction::SExt:
6330   case Instruction::FPToUI:
6331   case Instruction::FPToSI:
6332   case Instruction::FPExt:
6333   case Instruction::PtrToInt:
6334   case Instruction::IntToPtr:
6335   case Instruction::SIToFP:
6336   case Instruction::UIToFP:
6337   case Instruction::Trunc:
6338   case Instruction::FPTrunc:
6339   case Instruction::BitCast: {
6340     // We optimize the truncation of induction variables having constant
6341     // integer steps. The cost of these truncations is the same as the scalar
6342     // operation.
6343     if (isOptimizableIVTruncate(I, VF)) {
6344       auto *Trunc = cast<TruncInst>(I);
6345       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6346                                   Trunc->getSrcTy(), Trunc);
6347     }
6348 
6349     Type *SrcScalarTy = I->getOperand(0)->getType();
6350     Type *SrcVecTy =
6351         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6352     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or turn it
      // into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
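      // Similarly, a "trunc i32 %2 to i8" would then be costed with both its
      // source and destination types clamped towards the i16-based vector
      // type.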
6356       //
6357       // Calculate the modified src and dest types.
6358       Type *MinVecTy = VectorTy;
6359       if (I->getOpcode() == Instruction::Trunc) {
6360         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6361         VectorTy =
6362             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6363       } else if (I->getOpcode() == Instruction::ZExt ||
6364                  I->getOpcode() == Instruction::SExt) {
6365         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6366         VectorTy =
6367             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6368       }
6369     }
6370 
6371     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6372     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6373   }
6374   case Instruction::Call: {
6375     bool NeedToScalarize;
6376     CallInst *CI = cast<CallInst>(I);
6377     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6378     if (getVectorIntrinsicIDForCall(CI, TLI))
6379       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6380     return CallCost;
6381   }
6382   default:
6383     // The cost of executing VF copies of the scalar instruction. This opcode
6384     // is unknown. Assume that it is the same as 'mul'.
6385     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6386            getScalarizationOverhead(I, VF);
6387   } // end of switch.
6388 }
6389 
6390 char LoopVectorize::ID = 0;
6391 
6392 static const char lv_name[] = "Loop Vectorization";
6393 
6394 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6395 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6396 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6397 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6398 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6399 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6400 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6401 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6402 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6403 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6404 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6405 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6406 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6407 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6408 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6409 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6410 
6411 namespace llvm {
6412 
6413 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6414 
6415 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6416                               bool VectorizeOnlyWhenForced) {
6417   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6418 }
6419 
6420 } // end namespace llvm
6421 
6422 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6423   // Check if the pointer operand of a load or store instruction is
6424   // consecutive.
6425   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6426     return Legal->isConsecutivePtr(Ptr);
6427   return false;
6428 }
6429 
6430 void LoopVectorizationCostModel::collectValuesToIgnore() {
6431   // Ignore ephemeral values.
6432   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6433 
6434   // Ignore type-promoting instructions we identified during reduction
6435   // detection.
6436   for (auto &Reduction : Legal->getReductionVars()) {
6437     RecurrenceDescriptor &RedDes = Reduction.second;
6438     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6439     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6440   }
6441   // Ignore type-casting instructions we identified during induction
6442   // detection.
6443   for (auto &Induction : Legal->getInductionVars()) {
6444     InductionDescriptor &IndDes = Induction.second;
6445     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6446     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6447   }
6448 }
6449 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do that yet because VPlan currently
// doesn't have a cost model that can choose which plan to execute when
// more than one is generated.
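// For example, determineVPlanVF with 256-bit wide vector registers and a
// widest scalar type of 32 bits yields a VF of 8.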
6455 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6456                                  LoopVectorizationCostModel &CM) {
6457   unsigned WidestType;
6458   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6459   return WidestVectorRegBits / WidestType;
6460 }
6461 
6462 VectorizationFactor
6463 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6464   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
6469   if (!OrigLoop->empty()) {
6470     // If the user doesn't provide a vectorization factor, determine a
6471     // reasonable one.
6472     if (!UserVF) {
6473       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6474       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6475 
6476       // Make sure we have a VF > 1 for stress testing.
6477       if (VPlanBuildStressTest && VF < 2) {
6478         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6479                           << "overriding computed VF.\n");
6480         VF = 4;
6481       }
6482     }
6483     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6484     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6485     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6486                       << " to build VPlans.\n");
6487     buildVPlans(VF, VF);
6488 
6489     // For VPlan build stress testing, we bail out after VPlan construction.
6490     if (VPlanBuildStressTest)
6491       return VectorizationFactor::Disabled();
6492 
6493     return {VF, 0};
6494   }
6495 
6496   LLVM_DEBUG(
6497       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6498                 "VPlan-native path.\n");
6499   return VectorizationFactor::Disabled();
6500 }
6501 
6502 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6503   assert(OrigLoop->empty() && "Inner loop expected.");
6504   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6506     return None;
6507 
6508   // Invalidate interleave groups if all blocks of loop will be predicated.
6509   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6510       !useMaskedInterleavedAccesses(*TTI)) {
6511     LLVM_DEBUG(
6512         dbgs()
6513         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6514            "which requires masked-interleaved support.\n");
6515     CM.InterleaveInfo.reset();
6516   }
6517 
6518   if (UserVF) {
6519     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6520     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6521     // Collect the instructions (and their associated costs) that will be more
6522     // profitable to scalarize.
6523     CM.selectUserVectorizationFactor(UserVF);
6524     buildVPlansWithVPRecipes(UserVF, UserVF);
6525     LLVM_DEBUG(printPlans(dbgs()));
6526     return {{UserVF, 0}};
6527   }
6528 
6529   unsigned MaxVF = MaybeMaxVF.getValue();
6530   assert(MaxVF != 0 && "MaxVF is zero.");
6531 
6532   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6533     // Collect Uniform and Scalar instructions after vectorization with VF.
6534     CM.collectUniformsAndScalars(VF);
6535 
6536     // Collect the instructions (and their associated costs) that will be more
6537     // profitable to scalarize.
6538     if (VF > 1)
6539       CM.collectInstsToScalarize(VF);
6540   }
6541 
6542   buildVPlansWithVPRecipes(1, MaxVF);
6543   LLVM_DEBUG(printPlans(dbgs()));
6544   if (MaxVF == 1)
6545     return VectorizationFactor::Disabled();
6546 
6547   // Select the optimal vectorization factor.
6548   return CM.selectVectorizationFactor(MaxVF);
6549 }
6550 
6551 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6552   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6553                     << '\n');
6554   BestVF = VF;
6555   BestUF = UF;
6556 
6557   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6558     return !Plan->hasVF(VF);
6559   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6561 }
6562 
6563 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6564                                            DominatorTree *DT) {
6565   // Perform the actual loop transformation.
6566 
6567   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6568   VPCallbackILV CallbackILV(ILV);
6569 
6570   VPTransformState State{BestVF, BestUF,      LI,
6571                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6572                          &ILV,   CallbackILV};
6573   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6574   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6575 
6576   //===------------------------------------------------===//
6577   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
6581   //
6582   //===------------------------------------------------===//
6583 
6584   // 2. Copy and widen instructions from the old loop into the new loop.
6585   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6586   VPlans.front()->execute(&State);
6587 
6588   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6589   //    predication, updating analyses.
6590   ILV.fixVectorizedLoop();
6591 }
6592 
6593 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6594     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6595   BasicBlock *Latch = OrigLoop->getLoopLatch();
6596 
6597   // We create new control-flow for the vectorized loop, so the original
6598   // condition will be dead after vectorization if it's only used by the
6599   // branch.
6600   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6601   if (Cmp && Cmp->hasOneUse())
6602     DeadInstructions.insert(Cmp);
6603 
6604   // We create new "steps" for induction variable updates to which the original
6605   // induction variables map. An original update instruction will be dead if
6606   // all its users except the induction variable are dead.
6607   for (auto &Induction : Legal->getInductionVars()) {
6608     PHINode *Ind = Induction.first;
6609     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6610     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6611           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6612                                  DeadInstructions.end();
6613         }))
6614       DeadInstructions.insert(IndUpdate);
6615 
    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi and the casted
    // value of the phi are the same. The last instruction in this casting
    // chain will get its scalar/vector/widened def from the
    // scalar/vector/widened def of the respective phi node. Any other casts
    // in the induction def-use chain have no other uses outside the phi
    // update chain, and will be ignored.
6624     InductionDescriptor &IndDes = Induction.second;
6625     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6626     DeadInstructions.insert(Casts.begin(), Casts.end());
6627   }
6628 }
6629 
6630 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6631 
6632 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6633 
6634 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6635                                         Instruction::BinaryOps BinOp) {
6636   // When unrolling and the VF is 1, we only need to add a simple scalar.
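  // E.g., for StartIdx == 2 and an integer induction this produces
  // Val + 2 * Step.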
6637   Type *Ty = Val->getType();
6638   assert(!Ty->isVectorTy() && "Val must be a scalar");
6639 
6640   if (Ty->isFloatingPointTy()) {
6641     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6642 
6643     // Floating point operations had to be 'fast' to enable the unrolling.
6644     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6645     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6646   }
6647   Constant *C = ConstantInt::get(Ty, StartIdx);
6648   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6649 }
6650 
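// The resulting loop metadata has roughly the form:
//   !0 = !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// with !0 attached to the loop as its llvm.loop ID.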
6651 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6652   SmallVector<Metadata *, 4> MDs;
6653   // Reserve first location for self reference to the LoopID metadata node.
6654   MDs.push_back(nullptr);
6655   bool IsUnrollMetadata = false;
6656   MDNode *LoopID = L->getLoopID();
6657   if (LoopID) {
6658     // First find existing loop unrolling disable metadata.
6659     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6660       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6661       if (MD) {
6662         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6663         IsUnrollMetadata =
6664             S && S->getString().startswith("llvm.loop.unroll.disable");
6665       }
6666       MDs.push_back(LoopID->getOperand(i));
6667     }
6668   }
6669 
6670   if (!IsUnrollMetadata) {
6671     // Add runtime unroll disable metadata.
6672     LLVMContext &Context = L->getHeader()->getContext();
6673     SmallVector<Metadata *, 1> DisableOperands;
6674     DisableOperands.push_back(
6675         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6676     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6677     MDs.push_back(DisableNode);
6678     MDNode *NewLoopID = MDNode::get(Context, MDs);
6679     // Set operand 0 to refer to the loop id itself.
6680     NewLoopID->replaceOperandWith(0, NewLoopID);
6681     L->setLoopID(NewLoopID);
6682   }
6683 }
6684 
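// For example, with Range = {2, 16} and a predicate that holds for VF = 2 and
// VF = 4 but not for VF = 8, Range is clamped to {2, 8} and the value of the
// predicate at Range.Start (true) is returned.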
6685 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6686     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6687   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6688   bool PredicateAtRangeStart = Predicate(Range.Start);
6689 
6690   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6691     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6692       Range.End = TmpVF;
6693       break;
6694     }
6695 
6696   return PredicateAtRangeStart;
6697 }
6698 
6699 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6700 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6701 /// of VF's starting at a given VF and extending it as much as possible. Each
6702 /// vectorization decision can potentially shorten this sub-range during
6703 /// buildVPlan().
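/// For example, with MinVF = 2 and MaxVF = 16 this may produce one VPlan
/// covering {2, 4} and another covering {8, 16}, depending on how the
/// decisions taken while building each VPlan clamp its sub-range.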
6704 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6705   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6706     VFRange SubRange = {VF, MaxVF + 1};
6707     VPlans.push_back(buildVPlan(SubRange));
6708     VF = SubRange.End;
6709   }
6710 }
6711 
6712 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6713                                          VPlanPtr &Plan) {
6714   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6715 
6716   // Look for cached value.
6717   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6718   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6719   if (ECEntryIt != EdgeMaskCache.end())
6720     return ECEntryIt->second;
6721 
6722   VPValue *SrcMask = createBlockInMask(Src, Plan);
6723 
6724   // The terminator has to be a branch inst!
6725   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6726   assert(BI && "Unexpected terminator found");
6727 
6728   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6729     return EdgeMaskCache[Edge] = SrcMask;
6730 
6731   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6732   assert(EdgeMask && "No Edge Mask found for condition");
6733 
6734   if (BI->getSuccessor(0) != Dst)
6735     EdgeMask = Builder.createNot(EdgeMask);
6736 
6737   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6738     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6739 
6740   return EdgeMaskCache[Edge] = EdgeMask;
6741 }
6742 
6743 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6744   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6745 
6746   // Look for cached value.
6747   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6748   if (BCEntryIt != BlockMaskCache.end())
6749     return BCEntryIt->second;
6750 
6751   // All-one mask is modelled as no-mask following the convention for masked
6752   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6753   VPValue *BlockMask = nullptr;
6754 
6755   if (OrigLoop->getHeader() == BB) {
6756     if (!CM.blockNeedsPredication(BB))
6757       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6758 
6759     // Introduce the early-exit compare IV <= BTC to form header block mask.
6760     // This is used instead of IV < TC because TC may wrap, unlike BTC.
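    // (E.g., for an IV of type iN and a trip count of 2^N, TC wraps to 0
    // while BTC = 2^N - 1 is still representable.)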
6761     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6762     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6763     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6764     return BlockMaskCache[BB] = BlockMask;
6765   }
6766 
6767   // This is the block mask. We OR all incoming edges.
6768   for (auto *Predecessor : predecessors(BB)) {
6769     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6770     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6771       return BlockMaskCache[BB] = EdgeMask;
6772 
6773     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6774       BlockMask = EdgeMask;
6775       continue;
6776     }
6777 
6778     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6779   }
6780 
6781   return BlockMaskCache[BB] = BlockMask;
6782 }
6783 
6784 VPWidenMemoryInstructionRecipe *
6785 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6786                                   VPlanPtr &Plan) {
6787   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6788     return nullptr;
6789 
6790   auto willWiden = [&](unsigned VF) -> bool {
6791     if (VF == 1)
6792       return false;
6793     LoopVectorizationCostModel::InstWidening Decision =
6794         CM.getWideningDecision(I, VF);
6795     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6796            "CM decision should be taken at this point.");
6797     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6798       return true;
6799     if (CM.isScalarAfterVectorization(I, VF) ||
6800         CM.isProfitableToScalarize(I, VF))
6801       return false;
6802     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6803   };
6804 
6805   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6806     return nullptr;
6807 
6808   VPValue *Mask = nullptr;
6809   if (Legal->isMaskRequired(I))
6810     Mask = createBlockInMask(I->getParent(), Plan);
6811 
6812   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6813   if (LoadInst *Load = dyn_cast<LoadInst>(I))
6814     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
6815 
6816   StoreInst *Store = cast<StoreInst>(I);
6817   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
6818   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
6819 }
6820 
6821 VPWidenIntOrFpInductionRecipe *
6822 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6823   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6824     // Check if this is an integer or fp induction. If so, build the recipe that
6825     // produces its scalar and vector values.
6826     InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6827     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6828         II.getKind() == InductionDescriptor::IK_FpInduction)
6829       return new VPWidenIntOrFpInductionRecipe(Phi);
6830 
6831     return nullptr;
6832   }
6833 
6834   // Optimize the special case where the source is a constant integer
6835   // induction variable. Notice that we can only optimize the 'trunc' case
6836   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6837   // (c) other casts depend on pointer size.
6838 
6839   // Determine whether \p K is a truncation based on an induction variable that
6840   // can be optimized.
6841   auto isOptimizableIVTruncate =
6842       [&](Instruction *K) -> std::function<bool(unsigned)> {
6843     return
6844         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6845   };
6846 
6847   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6848                                isOptimizableIVTruncate(I), Range))
6849     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6850                                              cast<TruncInst>(I));
6851   return nullptr;
6852 }
6853 
6854 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6855   PHINode *Phi = dyn_cast<PHINode>(I);
6856   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6857     return nullptr;
6858 
6859   // We know that all PHIs in non-header blocks are converted into selects, so
6860   // we don't have to worry about the insertion order and we can just use the
6861   // builder. At this point we generate the predication tree. There may be
6862   // duplications since this is a simple recursive scan, but future
6863   // optimizations will clean it up.
6864 
6865   SmallVector<VPValue *, 2> Masks;
6866   unsigned NumIncoming = Phi->getNumIncomingValues();
6867   for (unsigned In = 0; In < NumIncoming; In++) {
6868     VPValue *EdgeMask =
6869       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6870     assert((EdgeMask || NumIncoming == 1) &&
6871            "Multiple predecessors with one having a full mask");
6872     if (EdgeMask)
6873       Masks.push_back(EdgeMask);
6874   }
6875   return new VPBlendRecipe(Phi, Masks);
6876 }
6877 
6878 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6879                                  VFRange &Range) {
6880 
6881   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6882       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6883 
6884   if (IsPredicated)
6885     return false;
6886 
6887   auto IsVectorizableOpcode = [](unsigned Opcode) {
6888     switch (Opcode) {
6889     case Instruction::Add:
6890     case Instruction::And:
6891     case Instruction::AShr:
6892     case Instruction::BitCast:
6893     case Instruction::Br:
6894     case Instruction::Call:
6895     case Instruction::FAdd:
6896     case Instruction::FCmp:
6897     case Instruction::FDiv:
6898     case Instruction::FMul:
6899     case Instruction::FNeg:
6900     case Instruction::FPExt:
6901     case Instruction::FPToSI:
6902     case Instruction::FPToUI:
6903     case Instruction::FPTrunc:
6904     case Instruction::FRem:
6905     case Instruction::FSub:
6906     case Instruction::ICmp:
6907     case Instruction::IntToPtr:
6908     case Instruction::Load:
6909     case Instruction::LShr:
6910     case Instruction::Mul:
6911     case Instruction::Or:
6912     case Instruction::PHI:
6913     case Instruction::PtrToInt:
6914     case Instruction::SDiv:
6915     case Instruction::Select:
6916     case Instruction::SExt:
6917     case Instruction::Shl:
6918     case Instruction::SIToFP:
6919     case Instruction::SRem:
6920     case Instruction::Store:
6921     case Instruction::Sub:
6922     case Instruction::Trunc:
6923     case Instruction::UDiv:
6924     case Instruction::UIToFP:
6925     case Instruction::URem:
6926     case Instruction::Xor:
6927     case Instruction::ZExt:
6928       return true;
6929     }
6930     return false;
6931   };
6932 
6933   if (!IsVectorizableOpcode(I->getOpcode()))
6934     return false;
6935 
6936   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6937     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6938     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6939                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6940       return false;
6941   }
6942 
6943   auto willWiden = [&](unsigned VF) -> bool {
6944     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6945                              CM.isProfitableToScalarize(I, VF)))
6946       return false;
6947     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6948       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6949       // The following case may be scalarized depending on the VF.
      // The flag shows whether we use an intrinsic or a usual call for the
      // vectorized version of the instruction, i.e., whether it is beneficial
      // to perform the intrinsic call rather than the library call.
6953       bool NeedToScalarize;
6954       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6955       bool UseVectorIntrinsic =
6956           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6957       return UseVectorIntrinsic || !NeedToScalarize;
6958     }
6959     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6960       assert(CM.getWideningDecision(I, VF) ==
6961                  LoopVectorizationCostModel::CM_Scalarize &&
6962              "Memory widening decisions should have been taken care by now");
6963       return false;
6964     }
6965     return true;
6966   };
6967 
6968   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6969     return false;
6970   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6971   // to avoid having to split recipes later.
6972   bool IsSingleton = Ingredient2Recipe.count(I);
6973 
6974   // Success: widen this instruction.
6975 
6976   // Use the default widening recipe. We optimize the common case where
6977   // consecutive instructions can be represented by a single recipe.
6978   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6979       LastExtensibleRecipe->appendInstruction(I))
6980     return true;
6981 
6982   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6983   if (!IsSingleton)
6984     LastExtensibleRecipe = WidenRecipe;
6985   setRecipe(I, WidenRecipe);
6986   VPBB->appendRecipe(WidenRecipe);
6987   return true;
6988 }
6989 
6990 VPBasicBlock *VPRecipeBuilder::handleReplication(
6991     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6992     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6993     VPlanPtr &Plan) {
6994   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6995       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6996       Range);
6997 
6998   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6999       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7000 
7001   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
7002   setRecipe(I, Recipe);
7003 
7004   // Find if I uses a predicated instruction. If so, it will use its scalar
7005   // value. Avoid hoisting the insert-element which packs the scalar value into
7006   // a vector value, as that happens iff all users use the vector value.
7007   for (auto &Op : I->operands())
7008     if (auto *PredInst = dyn_cast<Instruction>(Op))
7009       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7010         PredInst2Recipe[PredInst]->setAlsoPack(false);
7011 
7012   // Finalize the recipe for Instr, first if it is not predicated.
7013   if (!IsPredicated) {
7014     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7015     VPBB->appendRecipe(Recipe);
7016     return VPBB;
7017   }
7018   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7019   assert(VPBB->getSuccessors().empty() &&
7020          "VPBB has successors when handling predicated replication.");
7021   // Record predicated instructions for above packing optimizations.
7022   PredInst2Recipe[I] = Recipe;
7023   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7024   VPBlockUtils::insertBlockAfter(Region, VPBB);
7025   auto *RegSucc = new VPBasicBlock();
7026   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7027   return RegSucc;
7028 }
7029 
7030 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7031                                                       VPRecipeBase *PredRecipe,
7032                                                       VPlanPtr &Plan) {
7033   // Instructions marked for predication are replicated and placed under an
7034   // if-then construct to prevent side-effects.
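  // The region built below is a triangle: a "pred.<opcode>.entry" block with
  // a branch-on-mask recipe, a "pred.<opcode>.if" block holding the
  // replicated recipe, and a "pred.<opcode>.continue" block that rejoins
  // control flow and, for non-void instructions, holds a
  // predicated-instruction phi recipe.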
7035 
7036   // Generate recipes to compute the block mask for this region.
7037   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7038 
7039   // Build the triangular if-then region.
7040   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7041   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7042   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7043   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7044   auto *PHIRecipe =
7045       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7046   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7047   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7048   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7049 
7050   // Note: first set Entry as region entry and then connect successors starting
7051   // from it in order, to propagate the "parent" of each VPBasicBlock.
7052   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7053   VPBlockUtils::connectBlocks(Pred, Exit);
7054 
7055   return Region;
7056 }
7057 
7058 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7059                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7060   VPRecipeBase *Recipe = nullptr;
7061 
7062   // First, check for specific widening recipes that deal with memory
7063   // operations, inductions and Phi nodes.
7064   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7065       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7066       (Recipe = tryToBlend(Instr, Plan)) ||
7067       (isa<PHINode>(Instr) &&
7068        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7069     setRecipe(Instr, Recipe);
7070     VPBB->appendRecipe(Recipe);
7071     return true;
7072   }
7073 
7074   // Handle GEP widening.
7075   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7076     auto Scalarize = [&](unsigned VF) {
7077       return CM.isScalarWithPredication(Instr, VF) ||
7078              CM.isScalarAfterVectorization(Instr, VF) ||
7079              CM.isProfitableToScalarize(Instr, VF);
7080     };
7081     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7082       return false;
7083     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7084     setRecipe(Instr, Recipe);
7085     VPBB->appendRecipe(Recipe);
7086     return true;
7087   }
7088 
7089   // Check if Instr is to be widened by a general VPWidenRecipe, after
7090   // having first checked for specific widening recipes.
7091   if (tryToWiden(Instr, VPBB, Range))
7092     return true;
7093 
7094   return false;
7095 }
7096 
7097 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7098                                                         unsigned MaxVF) {
7099   assert(OrigLoop->empty() && "Inner loop expected.");
7100 
7101   // Collect conditions feeding internal conditional branches; they need to be
7102   // represented in VPlan for it to model masking.
7103   SmallPtrSet<Value *, 1> NeedDef;
7104 
7105   auto *Latch = OrigLoop->getLoopLatch();
7106   for (BasicBlock *BB : OrigLoop->blocks()) {
7107     if (BB == Latch)
7108       continue;
7109     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7110     if (Branch && Branch->isConditional())
7111       NeedDef.insert(Branch->getCondition());
7112   }
7113 
7114   // If the tail is to be folded by masking, the primary induction variable
7115   // needs to be represented in VPlan for it to model early-exit masking.
7116   // Also, both the Phi and the live-out instruction of each reduction are
7117   // required in order to introduce a select between them in VPlan.
7118   if (CM.foldTailByMasking()) {
7119     NeedDef.insert(Legal->getPrimaryInduction());
7120     for (auto &Reduction : Legal->getReductionVars()) {
7121       NeedDef.insert(Reduction.first);
7122       NeedDef.insert(Reduction.second.getLoopExitInstr());
7123     }
7124   }
7125 
7126   // Collect instructions from the original loop that will become trivially dead
7127   // in the vectorized loop. We don't need to vectorize these instructions. For
7128   // example, original induction update instructions can become dead because we
7129   // separately emit induction "steps" when generating code for the new loop.
7130   // Similarly, we create a new latch condition when setting up the structure
7131   // of the new loop, so the old one can become dead.
7132   SmallPtrSet<Instruction *, 4> DeadInstructions;
7133   collectTriviallyDeadInstructions(DeadInstructions);
7134 
7135   // Add assume instructions we need to drop to DeadInstructions, to prevent
7136   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7138   // control flow is preserved, we should keep them.
7139   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7140   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7141 
7142   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7143   // Dead instructions do not need sinking. Remove them from SinkAfter.
7144   for (Instruction *I : DeadInstructions)
7145     SinkAfter.erase(I);
7146 
7147   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7148     VFRange SubRange = {VF, MaxVF + 1};
7149     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7150                                              DeadInstructions, SinkAfter));
7151     VF = SubRange.End;
7152   }
7153 }
7154 
7155 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7156     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7157     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7158     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7159 
7160   // Hold a mapping from predicated instructions to their recipes, in order to
7161   // fix their AlsoPack behavior if a user is determined to replicate and use a
7162   // scalar instead of vector value.
7163   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7164 
7165   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7166 
7167   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7168 
7169   // ---------------------------------------------------------------------------
7170   // Pre-construction: record ingredients whose recipes we'll need to further
7171   // process after constructing the initial VPlan.
7172   // ---------------------------------------------------------------------------
7173 
7174   // Mark instructions we'll need to sink later and their targets as
7175   // ingredients whose recipe we'll need to record.
7176   for (auto &Entry : SinkAfter) {
7177     RecipeBuilder.recordRecipeOf(Entry.first);
7178     RecipeBuilder.recordRecipeOf(Entry.second);
7179   }
7180 
7181   // For each interleave group which is relevant for this (possibly trimmed)
7182   // Range, add it to the set of groups to be later applied to the VPlan and add
7183   // placeholders for its members' Recipes which we'll be replacing with a
7184   // single VPInterleaveRecipe.
7185   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7186     auto applyIG = [IG, this](unsigned VF) -> bool {
7187       return (VF >= 2 && // Query is illegal for VF == 1
7188               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7189                   LoopVectorizationCostModel::CM_Interleave);
7190     };
7191     if (!getDecisionAndClampRange(applyIG, Range))
7192       continue;
7193     InterleaveGroups.insert(IG);
7194     for (unsigned i = 0; i < IG->getFactor(); i++)
7195       if (Instruction *Member = IG->getMember(i))
7196         RecipeBuilder.recordRecipeOf(Member);
  }
7198 
7199   // ---------------------------------------------------------------------------
7200   // Build initial VPlan: Scan the body of the loop in a topological order to
7201   // visit each basic block after having visited its predecessor basic blocks.
7202   // ---------------------------------------------------------------------------
7203 
7204   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7205   auto Plan = std::make_unique<VPlan>();
7206   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7207   Plan->setEntry(VPBB);
7208 
7209   // Represent values that will have defs inside VPlan.
7210   for (Value *V : NeedDef)
7211     Plan->addVPValue(V);
7212 
7213   // Scan the body of the loop in a topological order to visit each basic block
7214   // after having visited its predecessor basic blocks.
7215   LoopBlocksDFS DFS(OrigLoop);
7216   DFS.perform(LI);
7217 
7218   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7219     // Relevant instructions from basic block BB will be grouped into VPRecipe
7220     // ingredients and fill a new VPBasicBlock.
7221     unsigned VPBBsForBB = 0;
7222     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7223     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7224     VPBB = FirstVPBBForBB;
7225     Builder.setInsertPoint(VPBB);
7226 
7227     // Introduce each ingredient into VPlan.
7228     for (Instruction &I : BB->instructionsWithoutDebug()) {
7229       Instruction *Instr = &I;
7230 
7231       // First filter out irrelevant instructions, to ensure no recipes are
7232       // built for them.
7233       if (isa<BranchInst>(Instr) ||
7234           DeadInstructions.find(Instr) != DeadInstructions.end())
7235         continue;
7236 
7237       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7238         continue;
7239 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7242       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7243           Instr, Range, VPBB, PredInst2Recipe, Plan);
7244       if (NextVPBB != VPBB) {
7245         VPBB = NextVPBB;
7246         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7247                                     : "");
7248       }
7249     }
7250   }
7251 
  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic blocks with no recipes.
7255   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7256   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7257   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7258   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7259   delete PreEntry;
7260 
7261   // ---------------------------------------------------------------------------
7262   // Transform initial VPlan: Apply previously taken decisions, in order, to
7263   // bring the VPlan to its final state.
7264   // ---------------------------------------------------------------------------
7265 
7266   // Apply Sink-After legal constraints.
7267   for (auto &Entry : SinkAfter) {
7268     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7269     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7270     Sink->moveAfter(Target);
7271   }
7272 
7273   // Interleave memory: for each Interleave Group we marked earlier as relevant
7274   // for this VPlan, replace the Recipes widening its memory instructions with a
7275   // single VPInterleaveRecipe at its insertion point.
7276   for (auto IG : InterleaveGroups) {
7277     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7278         RecipeBuilder.getRecipe(IG->getInsertPos()));
7279     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7280         ->insertBefore(Recipe);
7281 
7282     for (unsigned i = 0; i < IG->getFactor(); ++i)
7283       if (Instruction *Member = IG->getMember(i)) {
7284         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7285       }
7286   }
7287 
7288   // Finally, if tail is folded by masking, introduce selects between the phi
7289   // and the live-out instruction of each reduction, at the end of the latch.
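  // Conceptually, each reduction's live-out becomes
  // select(header-mask, live-out, phi), so lanes masked off by tail folding
  // keep the value carried in by the phi.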
7290   if (CM.foldTailByMasking()) {
7291     Builder.setInsertPoint(VPBB);
7292     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7293     for (auto &Reduction : Legal->getReductionVars()) {
7294       VPValue *Phi = Plan->getVPValue(Reduction.first);
7295       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7296       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7297     }
7298   }
7299 
7300   std::string PlanName;
7301   raw_string_ostream RSO(PlanName);
7302   unsigned VF = Range.Start;
7303   Plan->addVF(VF);
7304   RSO << "Initial VPlan for VF={" << VF;
7305   for (VF *= 2; VF < Range.End; VF *= 2) {
7306     Plan->addVF(VF);
7307     RSO << "," << VF;
7308   }
7309   RSO << "},UF>=1";
7310   RSO.flush();
7311   Plan->setName(PlanName);
7312 
7313   return Plan;
7314 }
7315 
7316 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
7321   assert(!OrigLoop->empty());
7322   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7323 
7324   // Create new empty VPlan
7325   auto Plan = std::make_unique<VPlan>();
7326 
7327   // Build hierarchical CFG
7328   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7329   HCFGBuilder.buildHierarchicalCFG();
7330 
7331   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7332     Plan->addVF(VF);
7333 
7334   if (EnableVPlanPredication) {
7335     VPlanPredicator VPP(*Plan);
7336     VPP.predicate();
7337 
7338     // Avoid running transformation to recipes until masked code generation in
7339     // VPlan-native path is in place.
7340     return Plan;
7341   }
7342 
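  // Lower the abstract VPInstructions in the plan to concrete recipes so that
  // code can be generated for them.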
7343   SmallPtrSet<Instruction *, 1> DeadInstructions;
7344   VPlanTransforms::VPInstructionsToVPRecipes(
7345       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7346   return Plan;
7347 }
7348 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7353 
7354 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7355     Value *V, const VPIteration &Instance) {
7356   return ILV.getOrCreateScalarValue(V, Instance);
7357 }
7358 
7359 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7360                                VPSlotTracker &SlotTracker) const {
7361   O << " +\n"
7362     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7363   IG->getInsertPos()->printAsOperand(O, false);
7364   O << ", ";
7365   getAddr()->printAsOperand(O, SlotTracker);
7366   VPValue *Mask = getMask();
7367   if (Mask) {
7368     O << ", ";
7369     Mask->printAsOperand(O, SlotTracker);
7370   }
7371   O << "\\l\"";
7372   for (unsigned i = 0; i < IG->getFactor(); ++i)
7373     if (Instruction *I = IG->getMember(i))
7374       O << " +\n"
7375         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7376 }
7377 
7378 void VPWidenRecipe::execute(VPTransformState &State) {
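  // Widen each ingredient in the [Begin, End) range of original instructions
  // into its vector counterpart.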
7379   for (auto &Instr : make_range(Begin, End))
7380     State.ILV->widenInstruction(Instr);
7381 }
7382 
7383 void VPWidenGEPRecipe::execute(VPTransformState &State) {
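  // Widen the GEP for all parts and lanes, exploiting any known loop
  // invariance of the pointer operand and of the individual indices.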
7384   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7385                       IsIndexLoopInvariant);
7386 }
7387 
7388 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7389   assert(!State.Instance && "Int or FP induction being replicated.");
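  // Generate the widened induction, truncating it to Trunc's type when a
  // truncate instruction is associated with this recipe.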
7390   State.ILV->widenIntOrFpInduction(IV, Trunc);
7391 }
7392 
7393 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7394   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7395 }
7396 
7397 void VPBlendRecipe::execute(VPTransformState &State) {
7398   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7399   // We know that all PHIs in non-header blocks are converted into
7400   // selects, so we don't have to worry about the insertion order and we
7401   // can just use the builder.
7402   // At this point we generate the predication tree. There may be
7403   // duplications since this is a simple recursive scan, but future
7404   // optimizations will clean it up.
7405 
7406   unsigned NumIncoming = Phi->getNumIncomingValues();
7407 
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7410   // Generate a sequence of selects of the form:
7411   // SELECT(Mask3, In3,
7412   //      SELECT(Mask2, In2,
7413   //                   ( ...)))
7414   InnerLoopVectorizer::VectorParts Entry(State.UF);
7415   for (unsigned In = 0; In < NumIncoming; ++In) {
7416     for (unsigned Part = 0; Part < State.UF; ++Part) {
7417       // We might have single edge PHIs (blocks) - use an identity
7418       // 'select' for the first PHI operand.
7419       Value *In0 =
7420           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7421       if (In == 0)
7422         Entry[Part] = In0; // Initialize with the first incoming value.
7423       else {
7424         // Select between the current value and the previous incoming edge
7425         // based on the incoming mask.
7426         Value *Cond = State.get(User->getOperand(In), Part);
7427         Entry[Part] =
7428             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7429       }
7430     }
7431   }
7432   for (unsigned Part = 0; Part < State.UF; ++Part)
7433     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7434 }
7435 
7436 void VPInterleaveRecipe::execute(VPTransformState &State) {
7437   assert(!State.Instance && "Interleave group being replicated.");
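  // Emit the wide loads/stores (and accompanying shuffles) for the whole group
  // at its insert position, using the recipe's address and optional mask.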
7438   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7439                                       getMask());
7440 }
7441 
7442 void VPReplicateRecipe::execute(VPTransformState &State) {
7443   if (State.Instance) { // Generate a single instance.
7444     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7445     // Insert scalar instance packing it into a vector.
7446     if (AlsoPack && State.VF > 1) {
7447       // If we're constructing lane 0, initialize to start from undef.
7448       if (State.Instance->Lane == 0) {
7449         Value *Undef =
7450             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7451         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7452       }
7453       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7454     }
7455     return;
7456   }
7457 
7458   // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for each
7460   // of the UF parts.
7461   unsigned EndLane = IsUniform ? 1 : State.VF;
7462   for (unsigned Part = 0; Part < State.UF; ++Part)
7463     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7464       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7465 }
7466 
7467 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7468   assert(State.Instance && "Branch on Mask works only on single instance.");
7469 
7470   unsigned Part = State.Instance->Part;
7471   unsigned Lane = State.Instance->Lane;
7472 
7473   Value *ConditionBit = nullptr;
7474   if (!User) // Block in mask is all-one.
7475     ConditionBit = State.Builder.getTrue();
7476   else {
7477     VPValue *BlockInMask = User->getOperand(0);
7478     ConditionBit = State.get(BlockInMask, Part);
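    // Per-part masks are vectors of i1; extract the bit for the current lane.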
7479     if (ConditionBit->getType()->isVectorTy())
7480       ConditionBit = State.Builder.CreateExtractElement(
7481           ConditionBit, State.Builder.getInt32(Lane));
7482   }
7483 
7484   // Replace the temporary unreachable terminator with a new conditional branch,
7485   // whose two destinations will be set later when they are created.
7486   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7487   assert(isa<UnreachableInst>(CurrentTerminator) &&
7488          "Expected to replace unreachable terminator with conditional branch.");
7489   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7490   CondBr->setSuccessor(0, nullptr);
7491   ReplaceInstWithInst(CurrentTerminator, CondBr);
7492 }
7493 
7494 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7495   assert(State.Instance && "Predicated instruction PHI works per instance.");
7496   Instruction *ScalarPredInst = cast<Instruction>(
7497       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7498   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7499   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7500   assert(PredicatingBB && "Predicated block has no single predecessor.");
7501 
7502   // By current pack/unpack logic we need to generate only a single phi node: if
7503   // a vector value for the predicated instruction exists at this point it means
7504   // the instruction has vector users only, and a phi for the vector value is
7505   // needed. In this case the recipe of the predicated instruction is marked to
7506   // also do that packing, thereby "hoisting" the insert-element sequence.
7507   // Otherwise, a phi node for the scalar value is needed.
7508   unsigned Part = State.Instance->Part;
7509   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7510     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7511     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7512     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7513     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7514     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7515     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7516   } else {
7517     Type *PredInstType = PredInst->getType();
7518     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7519     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7520     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7521     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7522   }
7523 }
7524 
7525 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
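  // Only store recipes carry a stored-value operand; loads pass a null value.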
7526   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7527   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7528                                         getMask());
7529 }
7530 
7531 // Determine how to lower the scalar epilogue, which depends on 1) optimising
7532 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7533 // predication, and 4) a TTI hook that analyses whether the loop is suitable
7534 // for predication.
7535 static ScalarEpilogueLowering getScalarEpilogueLowering(
7536     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7537     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7538     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7539     LoopVectorizationLegality &LVL) {
7540   bool OptSize =
7541       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7542                                                      PGSOQueryType::IRPass);
7543   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7544   // don't look at hints or options, and don't request a scalar epilogue.
7545   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7546     return CM_ScalarEpilogueNotAllowedOptSize;
7547 
7548   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7549                               !PreferPredicateOverEpilog;
7550 
7551   // 2) Next, if disabling predication is requested on the command line, honour
7552   // this and request a scalar epilogue. Also do this if we don't have a
7553   // primary induction variable, which is required for predication.
7554   if (PredicateOptDisabled || !LVL.getPrimaryInduction())
7555     return CM_ScalarEpilogueAllowed;
7556 
  // 3) and 4) Check whether predication is requested on the command line or
  // with a loop hint, or whether the TTI hook indicates that it is profitable;
  // if so, request predication.
7560   if (PreferPredicateOverEpilog ||
7561       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7562       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7563                                         LVL.getLAI()) &&
7564        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7565     return CM_ScalarEpilogueNotNeededUsePredicate;
7566 
7567   return CM_ScalarEpilogueAllowed;
7568 }
7569 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
// transformations to be applied from the very beginning without modifying the
// input LLVM IR.
7574 static bool processLoopInVPlanNativePath(
7575     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7576     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7577     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7578     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7579     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7580 
7581   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7582   Function *F = L->getHeader()->getParent();
7583   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7584 
7585   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7586       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7587 
7588   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7589                                 &Hints, IAI);
7590   // Use the planner for outer loop vectorization.
7591   // TODO: CM is not used at this point inside the planner. Turn CM into an
7592   // optional argument if we don't need it in the future.
7593   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7594 
7595   // Get user vectorization factor.
7596   const unsigned UserVF = Hints.getWidth();
7597 
7598   // Plan how to best vectorize, return the best VF and its cost.
7599   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7600 
7601   // If we are stress testing VPlan builds, do not attempt to generate vector
7602   // code. Masked vector code generation support will follow soon.
7603   // Also, do not attempt to vectorize if no vector code will be produced.
7604   if (VPlanBuildStressTest || EnableVPlanPredication ||
7605       VectorizationFactor::Disabled() == VF)
7606     return false;
7607 
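  // The VPlan-native path currently generates code with an interleave count
  // of 1 only.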
7608   LVP.setBestPlan(VF.Width, 1);
7609 
7610   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7611                          &CM);
7612   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7613                     << L->getHeader()->getParent()->getName() << "\"\n");
7614   LVP.executePlan(LB, DT);
7615 
7616   // Mark the loop as already vectorized to avoid vectorizing again.
7617   Hints.setAlreadyVectorized();
7618 
7619   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7620   return true;
7621 }
7622 
7623 bool LoopVectorizePass::processLoop(Loop *L) {
7624   assert((EnableVPlanNativePath || L->empty()) &&
7625          "VPlan-native path is not enabled. Only process inner loops.");
7626 
7627 #ifndef NDEBUG
7628   const std::string DebugLocStr = getDebugLocString(L);
7629 #endif /* NDEBUG */
7630 
7631   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7632                     << L->getHeader()->getParent()->getName() << "\" from "
7633                     << DebugLocStr << "\n");
7634 
7635   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7636 
7637   LLVM_DEBUG(
7638       dbgs() << "LV: Loop hints:"
7639              << " force="
7640              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7641                      ? "disabled"
7642                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7643                             ? "enabled"
7644                             : "?"))
7645              << " width=" << Hints.getWidth()
7646              << " unroll=" << Hints.getInterleave() << "\n");
7647 
7648   // Function containing loop
7649   Function *F = L->getHeader()->getParent();
7650 
7651   // Looking at the diagnostic output is the only way to determine if a loop
7652   // was vectorized (other than looking at the IR or machine code), so it
7653   // is important to generate an optimization remark for each loop. Most of
7654   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7655   // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
7657   // benefit from vectorization, respectively.
7658 
7659   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7660     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7661     return false;
7662   }
7663 
7664   PredicatedScalarEvolution PSE(*SE, *L);
7665 
7666   // Check if it is legal to vectorize the loop.
7667   LoopVectorizationRequirements Requirements(*ORE);
7668   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7669                                 &Requirements, &Hints, DB, AC);
7670   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7671     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7672     Hints.emitRemarkWithHints();
7673     return false;
7674   }
7675 
7676   // Check the function attributes and profiles to find out if this function
7677   // should be optimized for size.
7678   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7679       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7680 
7681   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7682   // here. They may require CFG and instruction level transformations before
7683   // even evaluating whether vectorization is profitable. Since we cannot modify
7684   // the incoming IR, we need to build VPlan upfront in the vectorization
7685   // pipeline.
7686   if (!L->empty())
7687     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7688                                         ORE, BFI, PSI, Hints);
7689 
7690   assert(L->empty() && "Inner loop expected.");
7691 
7692   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7693   // count by optimizing for size, to minimize overheads.
7694   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7695   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7696     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7697                       << "This loop is worth vectorizing only if no scalar "
7698                       << "iteration overheads are incurred.");
7699     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7700       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7701     else {
7702       LLVM_DEBUG(dbgs() << "\n");
7703       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7704     }
7705   }
7706 
7707   // Check the function attributes to see if implicit floats are allowed.
7708   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7709   // an integer loop and the vector instructions selected are purely integer
7710   // vector instructions?
7711   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7712     reportVectorizationFailure(
7713         "Can't vectorize when the NoImplicitFloat attribute is used",
7714         "loop not vectorized due to NoImplicitFloat attribute",
7715         "NoImplicitFloat", ORE, L);
7716     Hints.emitRemarkWithHints();
7717     return false;
7718   }
7719 
7720   // Check if the target supports potentially unsafe FP vectorization.
7721   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7722   // for the target we're vectorizing for, to make sure none of the
7723   // additional fp-math flags can help.
7724   if (Hints.isPotentiallyUnsafe() &&
7725       TTI->isFPVectorizationPotentiallyUnsafe()) {
7726     reportVectorizationFailure(
7727         "Potentially unsafe FP op prevents vectorization",
7728         "loop not vectorized due to unsafe FP support.",
7729         "UnsafeFP", ORE, L);
7730     Hints.emitRemarkWithHints();
7731     return false;
7732   }
7733 
7734   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7735   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7736 
7737   // If an override option has been passed in for interleaved accesses, use it.
7738   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7739     UseInterleaved = EnableInterleavedMemAccesses;
7740 
7741   // Analyze interleaved memory accesses.
7742   if (UseInterleaved) {
7743     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7744   }
7745 
7746   // Use the cost model.
7747   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7748                                 F, &Hints, IAI);
7749   CM.collectValuesToIgnore();
7750 
7751   // Use the planner for vectorization.
7752   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7753 
7754   // Get user vectorization factor.
7755   unsigned UserVF = Hints.getWidth();
7756 
7757   // Plan how to best vectorize, return the best VF and its cost.
7758   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7759 
7760   VectorizationFactor VF = VectorizationFactor::Disabled();
7761   unsigned IC = 1;
7762   unsigned UserIC = Hints.getInterleave();
7763 
7764   if (MaybeVF) {
7765     VF = *MaybeVF;
7766     // Select the interleave count.
7767     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7768   }
7769 
7770   // Identify the diagnostic messages that should be produced.
7771   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7772   bool VectorizeLoop = true, InterleaveLoop = true;
7773   if (Requirements.doesNotMeet(F, L, Hints)) {
7774     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7775                          "requirements.\n");
7776     Hints.emitRemarkWithHints();
7777     return false;
7778   }
7779 
7780   if (VF.Width == 1) {
7781     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7782     VecDiagMsg = std::make_pair(
7783         "VectorizationNotBeneficial",
7784         "the cost-model indicates that vectorization is not beneficial");
7785     VectorizeLoop = false;
7786   }
7787 
7788   if (!MaybeVF && UserIC > 1) {
7789     // Tell the user interleaving was avoided up-front, despite being explicitly
7790     // requested.
7791     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7792                          "interleaving should be avoided up front\n");
7793     IntDiagMsg = std::make_pair(
7794         "InterleavingAvoided",
7795         "Ignoring UserIC, because interleaving was avoided up front");
7796     InterleaveLoop = false;
7797   } else if (IC == 1 && UserIC <= 1) {
7798     // Tell the user interleaving is not beneficial.
7799     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7800     IntDiagMsg = std::make_pair(
7801         "InterleavingNotBeneficial",
7802         "the cost-model indicates that interleaving is not beneficial");
7803     InterleaveLoop = false;
7804     if (UserIC == 1) {
7805       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7806       IntDiagMsg.second +=
7807           " and is explicitly disabled or interleave count is set to 1";
7808     }
7809   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but is explicitly disabled.
7811     LLVM_DEBUG(
7812         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7813     IntDiagMsg = std::make_pair(
7814         "InterleavingBeneficialButDisabled",
7815         "the cost-model indicates that interleaving is beneficial "
7816         "but is explicitly disabled or interleave count is set to 1");
7817     InterleaveLoop = false;
7818   }
7819 
7820   // Override IC if user provided an interleave count.
7821   IC = UserIC > 0 ? UserIC : IC;
7822 
7823   // Emit diagnostic messages, if any.
7824   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7825   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7827     ORE->emit([&]() {
7828       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7829                                       L->getStartLoc(), L->getHeader())
7830              << VecDiagMsg.second;
7831     });
7832     ORE->emit([&]() {
7833       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7834                                       L->getStartLoc(), L->getHeader())
7835              << IntDiagMsg.second;
7836     });
7837     return false;
7838   } else if (!VectorizeLoop && InterleaveLoop) {
7839     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7840     ORE->emit([&]() {
7841       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7842                                         L->getStartLoc(), L->getHeader())
7843              << VecDiagMsg.second;
7844     });
7845   } else if (VectorizeLoop && !InterleaveLoop) {
7846     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7847                       << ") in " << DebugLocStr << '\n');
7848     ORE->emit([&]() {
7849       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7850                                         L->getStartLoc(), L->getHeader())
7851              << IntDiagMsg.second;
7852     });
7853   } else if (VectorizeLoop && InterleaveLoop) {
7854     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7855                       << ") in " << DebugLocStr << '\n');
7856     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7857   }
7858 
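  // Record the selected vectorization factor and interleave count in the
  // planner before generating code.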
7859   LVP.setBestPlan(VF.Width, IC);
7860 
7861   using namespace ore;
7862   bool DisableRuntimeUnroll = false;
7863   MDNode *OrigLoopID = L->getLoopID();
7864 
7865   if (!VectorizeLoop) {
7866     assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that vectorizing the loop is not profitable, interleave it
    // instead.
7869     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7870                                &CM);
7871     LVP.executePlan(Unroller, DT);
7872 
7873     ORE->emit([&]() {
7874       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7875                                 L->getHeader())
7876              << "interleaved loop (interleaved count: "
7877              << NV("InterleaveCount", IC) << ")";
7878     });
7879   } else {
    // If we decided that vectorizing the loop is worthwhile, then do it.
7881     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7882                            &LVL, &CM);
7883     LVP.executePlan(LB, DT);
7884     ++LoopsVectorized;
7885 
    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
7889     if (!LB.areSafetyChecksAdded())
7890       DisableRuntimeUnroll = true;
7891 
7892     // Report the vectorization decision.
7893     ORE->emit([&]() {
7894       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7895                                 L->getHeader())
7896              << "vectorized loop (vectorization width: "
7897              << NV("VectorizationFactor", VF.Width)
7898              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7899     });
7900   }
7901 
7902   Optional<MDNode *> RemainderLoopID =
7903       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7904                                       LLVMLoopVectorizeFollowupEpilogue});
7905   if (RemainderLoopID.hasValue()) {
7906     L->setLoopID(RemainderLoopID.getValue());
7907   } else {
7908     if (DisableRuntimeUnroll)
7909       AddRuntimeUnrollDisableMetaData(L);
7910 
7911     // Mark the loop as already vectorized to avoid vectorizing again.
7912     Hints.setAlreadyVectorized();
7913   }
7914 
7915   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7916   return true;
7917 }
7918 
7919 bool LoopVectorizePass::runImpl(
7920     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7921     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7922     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7923     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7924     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7925   SE = &SE_;
7926   LI = &LI_;
7927   TTI = &TTI_;
7928   DT = &DT_;
7929   BFI = &BFI_;
7930   TLI = TLI_;
7931   AA = &AA_;
7932   AC = &AC_;
7933   GetLAA = &GetLAA_;
7934   DB = &DB_;
7935   ORE = &ORE_;
7936   PSI = PSI_;
7937 
7938   // Don't attempt if
7939   // 1. the target claims to have no vector registers, and
7940   // 2. interleaving won't help ILP.
7941   //
7942   // The second condition is necessary because, even if the target has no
7943   // vector registers, loop vectorization may still enable scalar
7944   // interleaving.
7945   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7946       TTI->getMaxInterleaveFactor(1) < 2)
7947     return false;
7948 
7949   bool Changed = false;
7950 
7951   // The vectorizer requires loops to be in simplified form.
7952   // Since simplification may add new inner loops, it has to run before the
7953   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7955   // vectorized.
7956   for (auto &L : *LI)
7957     Changed |=
7958         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7959 
7960   // Build up a worklist of inner-loops to vectorize. This is necessary as
7961   // the act of vectorizing or partially unrolling a loop creates new loops
7962   // and can invalidate iterators across the loops.
7963   SmallVector<Loop *, 8> Worklist;
7964 
7965   for (Loop *L : *LI)
7966     collectSupportedLoops(*L, LI, ORE, Worklist);
7967 
7968   LoopsAnalyzed += Worklist.size();
7969 
7970   // Now walk the identified inner loops.
7971   while (!Worklist.empty()) {
7972     Loop *L = Worklist.pop_back_val();
7973 
7974     // For the inner loops we actually process, form LCSSA to simplify the
7975     // transform.
7976     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7977 
7978     Changed |= processLoop(L);
7979   }
7980 
7981   // Process each loop nest in the function.
7982   return Changed;
7983 }
7984 
7985 PreservedAnalyses LoopVectorizePass::run(Function &F,
7986                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
}
8029