1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to determine the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
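//
// For illustration only (independent of any particular target), a loop such as
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// is conceptually rewritten for a vectorization factor of 4 as
//
//   for (int i = 0; i < n; i += 4)
//     A[i..i+3] = B[i..i+3] + 42;   // one wide SIMD operation per statement
//
// with any leftover iterations handled by a scalar epilogue loop or by
// predication, as described below.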
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Indicates that an epilogue is undesired; predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired; predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting the vectorization factor, "
192              "which will be determined by the smallest type in the loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with an estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics that minimize code growth in cold regions and are "
242              "more aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if-predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
281 // VPlan-native vectorization path. It must be used in conjunction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function that returns the type of loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
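/// For example (assuming a typical data layout): at VF = 4, i1 is irregular,
/// because 4 * alloc-size(i1) is four bytes while the store size of <4 x i1>
/// is a single byte; at VF = 1, x86_fp80 is irregular on targets that pad its
/// 80 significant bits out to a 96- or 128-bit allocation.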
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = VectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
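/// For example, the returned value of 2 encodes the 50% assumption: the cost
/// model divides the estimated cost of a predicated block by 2 to approximate
/// its expected per-iteration contribution.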
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors, or into multiple
379 /// scalar copies. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
388 /// LoopVectorizationLegality class to provide information about the induction
389 /// and reduction variables that were found for a given vectorization factor.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM)
399       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
400         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
401         Builder(PSE.getSE()->getContext()),
402         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
403   virtual ~InnerLoopVectorizer() = default;
404 
405   /// Create a new empty loop. Unlink the old loop and connect the new one.
406   /// Return the pre-header block of the new loop.
407   BasicBlock *createVectorizedLoopSkeleton();
408 
409   /// Widen a single instruction within the innermost loop.
410   void widenInstruction(Instruction &I);
411 
412   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
413   void fixVectorizedLoop();
414 
415   // Return true if any runtime check is added.
416   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
417 
418   /// A type for vectorized values in the new loop. Each value from the
419   /// original loop, when vectorized, is represented by UF vector values in the
420   /// new unrolled loop, where UF is the unroll factor.
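  /// For example, with UF = 2 a single original value is represented by two
  /// vector values, one per unrolled part.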
421   using VectorParts = SmallVector<Value *, 2>;
422 
423   /// Vectorize a single GetElementPtrInst based on information gathered and
424   /// decisions taken during planning.
425   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
426                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
427 
428   /// Vectorize a single PHINode in a block. This method handles the induction
429   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
430   /// arbitrary length vectors.
431   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
432 
433   /// A helper function to scalarize a single Instruction in the innermost loop.
434   /// Generates a scalar instance of \p Instr for the unroll part and vector
435   /// lane given by \p Instance. \p IfPredicateInstr indicates whether the
436   /// instruction requires predication.
437   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
438                             bool IfPredicateInstr);
439 
440   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
441   /// is provided, the integer induction variable will first be truncated to
442   /// the corresponding type.
443   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
444 
445   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
446   /// vector or scalar value on-demand if one is not yet available. When
447   /// vectorizing a loop, we visit the definition of an instruction before its
448   /// uses. When visiting the definition, we either vectorize or scalarize the
449   /// instruction, creating an entry for it in the corresponding map. (In some
450   /// cases, such as induction variables, we will create both vector and scalar
451   /// entries.) Then, as we encounter uses of the definition, we derive values
452   /// for each scalar or vector use unless such a value is already available.
453   /// For example, if we scalarize a definition and one of its uses is vector,
454   /// we build the required vector on-demand with an insertelement sequence
455   /// when visiting the use. Otherwise, if the use is scalar, we can use the
456   /// existing scalar definition.
457   ///
458   /// Return a value in the new loop corresponding to \p V from the original
459   /// loop at unroll index \p Part. If the value has already been vectorized,
460   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
461   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
462   /// a new vector value on-demand by inserting the scalar values into a vector
463   /// with an insertelement sequence. If the value has been neither vectorized
464   /// nor scalarized, it must be loop invariant, so we simply broadcast the
465   /// value into a vector.
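  ///
  /// For example (illustrative IR; the %d.* and %v.* names are made up), if a
  /// definition was scalarized for VF = 4 and a vector use is then
  /// encountered, the scalar lanes are packed on demand:
  ///
  ///   %v.0 = insertelement <4 x i32> undef, i32 %d.0, i32 0
  ///   %v.1 = insertelement <4 x i32> %v.0,  i32 %d.1, i32 1
  ///   ...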
466   Value *getOrCreateVectorValue(Value *V, unsigned Part);
467 
468   /// Return a value in the new loop corresponding to \p V from the original
469   /// loop at unroll and vector indices \p Instance. If the value has been
470   /// vectorized but not scalarized, the necessary extractelement instruction
471   /// will be generated.
472   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
473 
474   /// Construct the vector value of a scalarized value \p V one lane at a time.
475   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
476 
477   /// Try to vectorize the interleaved access group that \p Instr belongs to
478   /// with the base address given in \p Addr, optionally masking the vector
479   /// operations if \p BlockInMask is non-null. Use \p State to translate given
480   /// VPValues to IR values in the vectorized loop.
481   void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
482                                 VPValue *Addr, VPValue *BlockInMask = nullptr);
483 
484   /// Vectorize Load and Store instructions with the base address given in \p
485   /// Addr, optionally masking the vector operations if \p BlockInMask is
486   /// non-null. Use \p State to translate given VPValues to IR values in the
487   /// vectorized loop.
488   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
489                                   VPValue *Addr,
490                                   VPValue *BlockInMask = nullptr);
491 
492   /// Set the debug location in the builder using the debug location in
493   /// the instruction.
494   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
495 
496   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
497   void fixNonInductionPHIs();
498 
499 protected:
500   friend class LoopVectorizationPlanner;
501 
502   /// A small list of PHINodes.
503   using PhiVector = SmallVector<PHINode *, 4>;
504 
505   /// A type for scalarized values in the new loop. Each value from the
506   /// original loop, when scalarized, is represented by UF x VF scalar values
507   /// in the new unrolled loop, where UF is the unroll factor and VF is the
508   /// vectorization factor.
509   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
510 
511   /// Set up the values of the IVs correctly when exiting the vector loop.
512   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
513                     Value *CountRoundDown, Value *EndValue,
514                     BasicBlock *MiddleBlock);
515 
516   /// Create a new induction variable inside L.
517   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
518                                    Value *Step, Instruction *DL);
519 
520   /// Handle all cross-iteration phis in the header.
521   void fixCrossIterationPHIs();
522 
523   /// Fix a first-order recurrence. This is the second phase of vectorizing
524   /// this phi node.
525   void fixFirstOrderRecurrence(PHINode *Phi);
526 
527   /// Fix a reduction cross-iteration phi. This is the second phase of
528   /// vectorizing this phi node.
529   void fixReduction(PHINode *Phi);
530 
531   /// Clear NSW/NUW flags from reduction instructions if necessary.
532   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
533 
534   /// The loop exit block may have single-value PHI nodes with some
535   /// incoming value. While vectorizing we only handled real values
536   /// that were defined inside the loop; we should have one value for
537   /// each predecessor of its parent basic block. See PR14725.
538   void fixLCSSAPHIs();
539 
540   /// Iteratively sink the scalarized operands of a predicated instruction into
541   /// the block that was created for it.
542   void sinkScalarOperands(Instruction *PredInst);
543 
544   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
545   /// represented as.
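  /// For example, if an i32 value is known to need only 8 significant bits at
  /// VF = 4, its <4 x i32> form can be computed as <4 x i8> and extended back
  /// where a wider use requires it (illustrative; driven by the MinBWs map
  /// computed by the cost model).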
546   void truncateToMinimalBitwidths();
547 
548   /// Create a broadcast instruction. This method generates a broadcast
549   /// instruction (shuffle) for loop invariant values and for the induction
550   /// value. If this is the induction variable then we extend it to N, N+1, ...
551   /// This is needed because each iteration in the loop corresponds to a SIMD
552   /// element.
553   virtual Value *getBroadcastInstrs(Value *V);
554 
555   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
556   /// to each vector element of Val. The sequence starts at StartIdx.
557   /// \p Opcode is relevant for FP induction variable.
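  /// For example (illustrative), with VF = 4, StartIdx = 0 and Step = 1, a
  /// broadcast induction value <%iv, %iv, %iv, %iv> becomes
  /// <%iv + 0, %iv + 1, %iv + 2, %iv + 3>.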
558   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
559                                Instruction::BinaryOps Opcode =
560                                Instruction::BinaryOpsEnd);
561 
562   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
563   /// variable on which to base the steps, \p Step is the size of the step, and
564   /// \p EntryVal is the value from the original loop that maps to the steps.
565   /// Note that \p EntryVal doesn't have to be an induction variable - it
566   /// can also be a truncate instruction.
567   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
568                         const InductionDescriptor &ID);
569 
570   /// Create a vector induction phi node based on an existing scalar one. \p
571   /// EntryVal is the value from the original loop that maps to the vector phi
572   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
573   /// truncate instruction, instead of widening the original IV, we widen a
574   /// version of the IV truncated to \p EntryVal's type.
575   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
576                                        Value *Step, Instruction *EntryVal);
577 
578   /// Returns true if an instruction \p I should be scalarized instead of
579   /// vectorized for the chosen vectorization factor.
580   bool shouldScalarizeInstruction(Instruction *I) const;
581 
582   /// Returns true if we should generate a scalar version of \p IV.
583   bool needsScalarInduction(Instruction *IV) const;
584 
585   /// If there is a cast involved in the induction variable \p ID, which should
586   /// be ignored in the vectorized loop body, this function records the
587   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
588   /// cast. We had already proved that the casted Phi is equal to the uncasted
589   /// Phi in the vectorized loop (under a runtime guard), and therefore
590   /// there is no need to vectorize the cast - the same value can be used in the
591   /// vector loop for both the Phi and the cast.
592   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
593   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
594   ///
595   /// \p EntryVal is the value from the original loop that maps to the vector
596   /// phi node and is used to distinguish what is the IV currently being
597   /// processed - original one (if \p EntryVal is a phi corresponding to the
598   /// original IV) or the "newly-created" one based on the proof mentioned above
599   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
600   /// latter case \p EntryVal is a TruncInst and we must not record anything for
601   /// that IV, but it's error-prone to expect callers of this routine to care
602   /// about that, hence this explicit parameter.
603   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
604                                              const Instruction *EntryVal,
605                                              Value *VectorLoopValue,
606                                              unsigned Part,
607                                              unsigned Lane = UINT_MAX);
608 
609   /// Generate a shuffle sequence that will reverse the vector Vec.
610   virtual Value *reverseVector(Value *Vec);
611 
612   /// Returns (and creates if needed) the original loop trip count.
613   Value *getOrCreateTripCount(Loop *NewLoop);
614 
615   /// Returns (and creates if needed) the trip count of the widened loop.
616   Value *getOrCreateVectorTripCount(Loop *NewLoop);
617 
618   /// Returns a bitcasted value to the requested vector type.
619   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
620   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
621                                 const DataLayout &DL);
622 
623   /// Emit a bypass check to see if the vector trip count is zero, including if
624   /// it overflows.
625   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
626 
627   /// Emit a bypass check to see if all of the SCEV assumptions we've
628   /// had to make are correct.
629   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
630 
631   /// Emit bypass checks to check any memory assumptions we may have made.
632   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
633 
634   /// Compute the transformed value of Index at offset StartValue using step
635   /// StepValue.
636   /// For integer induction, returns StartValue + Index * StepValue.
637   /// For pointer induction, returns StartValue[Index * StepValue].
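  /// For example, an integer induction with StartValue = 7 and StepValue = 3
  /// maps Index = 4 to 7 + 4 * 3 = 19 (values chosen for illustration only).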
638   /// FIXME: The newly created binary instructions should contain nsw/nuw
639   /// flags, which can be found from the original scalar operations.
640   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
641                               const DataLayout &DL,
642                               const InductionDescriptor &ID) const;
643 
644   /// Add additional metadata to \p To that was not present on \p Orig.
645   ///
646   /// Currently this is used to add the noalias annotations based on the
647   /// inserted memchecks.  Use this for instructions that are *cloned* into the
648   /// vector loop.
649   void addNewMetadata(Instruction *To, const Instruction *Orig);
650 
651   /// Add metadata from one instruction to another.
652   ///
653   /// This includes both the original MDs from \p From and additional ones (\see
654   /// addNewMetadata).  Use this for *newly created* instructions in the vector
655   /// loop.
656   void addMetadata(Instruction *To, Instruction *From);
657 
658   /// Similar to the previous function but it adds the metadata to a
659   /// vector of instructions.
660   void addMetadata(ArrayRef<Value *> To, Instruction *From);
661 
662   /// The original loop.
663   Loop *OrigLoop;
664 
665   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
666   /// dynamic knowledge to simplify SCEV expressions and converts them to a
667   /// more usable form.
668   PredicatedScalarEvolution &PSE;
669 
670   /// Loop Info.
671   LoopInfo *LI;
672 
673   /// Dominator Tree.
674   DominatorTree *DT;
675 
676   /// Alias Analysis.
677   AliasAnalysis *AA;
678 
679   /// Target Library Info.
680   const TargetLibraryInfo *TLI;
681 
682   /// Target Transform Info.
683   const TargetTransformInfo *TTI;
684 
685   /// Assumption Cache.
686   AssumptionCache *AC;
687 
688   /// Interface to emit optimization remarks.
689   OptimizationRemarkEmitter *ORE;
690 
691   /// LoopVersioning.  It's only set up (non-null) if memchecks were
692   /// used.
693   ///
694   /// This is currently only used to add no-alias metadata based on the
695   /// memchecks.  The actual versioning is performed manually.
696   std::unique_ptr<LoopVersioning> LVer;
697 
698   /// The vectorization SIMD factor to use. Each vector will have this many
699   /// vector elements.
700   unsigned VF;
701 
702   /// The vectorization unroll factor to use. Each scalar is vectorized to this
703   /// many different vector instructions.
704   unsigned UF;
705 
706   /// The builder that we use.
707   IRBuilder<> Builder;
708 
709   // --- Vectorization state ---
710 
711   /// The vector-loop preheader.
712   BasicBlock *LoopVectorPreHeader;
713 
714   /// The scalar-loop preheader.
715   BasicBlock *LoopScalarPreHeader;
716 
717   /// Middle block between the vector and the scalar loops.
718   BasicBlock *LoopMiddleBlock;
719 
720   /// The ExitBlock of the scalar loop.
721   BasicBlock *LoopExitBlock;
722 
723   /// The vector loop body.
724   BasicBlock *LoopVectorBody;
725 
726   /// The scalar loop body.
727   BasicBlock *LoopScalarBody;
728 
729   /// A list of all bypass blocks. The first block is the entry of the loop.
730   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
731 
732   /// The new induction variable, which was added to the new block.
733   PHINode *Induction = nullptr;
734 
735   /// The induction variable of the old basic block.
736   PHINode *OldInduction = nullptr;
737 
738   /// Maps values from the original loop to their corresponding values in the
739   /// vectorized loop. A key value can map to either vector values, scalar
740   /// values or both kinds of values, depending on whether the key was
741   /// vectorized and scalarized.
742   VectorizerValueMap VectorLoopValueMap;
743 
744   /// Store instructions that were predicated.
745   SmallVector<Instruction *, 4> PredicatedInstructions;
746 
747   /// Trip count of the original loop.
748   Value *TripCount = nullptr;
749 
750   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
751   Value *VectorTripCount = nullptr;
752 
753   /// The legality analysis.
754   LoopVectorizationLegality *Legal;
755 
756   /// The profitability analysis.
757   LoopVectorizationCostModel *Cost;
758 
759   // Record whether runtime checks are added.
760   bool AddedSafetyChecks = false;
761 
762   // Holds the end values for each induction variable. We save the end values
763   // so we can later fix up the external users of the induction variables.
764   DenseMap<PHINode *, Value *> IVEndValues;
765 
766   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
767   // fixed up at the end of vector code generation.
768   SmallVector<PHINode *, 8> OrigPHIsToFix;
769 };
770 
771 class InnerLoopUnroller : public InnerLoopVectorizer {
772 public:
773   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
774                     LoopInfo *LI, DominatorTree *DT,
775                     const TargetLibraryInfo *TLI,
776                     const TargetTransformInfo *TTI, AssumptionCache *AC,
777                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
778                     LoopVectorizationLegality *LVL,
779                     LoopVectorizationCostModel *CM)
780       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
781                             UnrollFactor, LVL, CM) {}
782 
783 private:
784   Value *getBroadcastInstrs(Value *V) override;
785   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
786                        Instruction::BinaryOps Opcode =
787                        Instruction::BinaryOpsEnd) override;
788   Value *reverseVector(Value *Vec) override;
789 };
790 
791 } // end namespace llvm
792 
793 /// Look for a meaningful debug location on the instruction or its
794 /// operands.
795 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
796   if (!I)
797     return I;
798 
799   DebugLoc Empty;
800   if (I->getDebugLoc() != Empty)
801     return I;
802 
803   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
804     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
805       if (OpInst->getDebugLoc() != Empty)
806         return OpInst;
807   }
808 
809   return I;
810 }
811 
812 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
813   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
814     const DILocation *DIL = Inst->getDebugLoc();
815     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
816         !isa<DbgInfoIntrinsic>(Inst)) {
817       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
818       if (NewDIL)
819         B.SetCurrentDebugLocation(NewDIL.getValue());
820       else
821         LLVM_DEBUG(dbgs()
822                    << "Failed to create new discriminator: "
823                    << DIL->getFilename() << " Line: " << DIL->getLine());
824     }
825     else
826       B.SetCurrentDebugLocation(DIL);
827   } else
828     B.SetCurrentDebugLocation(DebugLoc());
829 }
830 
831 /// Write a record \p DebugMsg about vectorization failure to the debug
832 /// output stream. If \p I is passed, it is an instruction that prevents
833 /// vectorization.
834 #ifndef NDEBUG
835 static void debugVectorizationFailure(const StringRef DebugMsg,
836     Instruction *I) {
837   dbgs() << "LV: Not vectorizing: " << DebugMsg;
838   if (I != nullptr)
839     dbgs() << " " << *I;
840   else
841     dbgs() << '.';
842   dbgs() << '\n';
843 }
844 #endif
845 
846 /// Create an analysis remark that explains why vectorization failed
847 ///
848 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
849 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
850 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
851 /// the location of the remark.  \return the remark object that can be
852 /// streamed to.
853 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
854     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
855   Value *CodeRegion = TheLoop->getHeader();
856   DebugLoc DL = TheLoop->getStartLoc();
857 
858   if (I) {
859     CodeRegion = I->getParent();
860     // If there is no debug location attached to the instruction, fall back to
861     // using the loop's.
862     if (I->getDebugLoc())
863       DL = I->getDebugLoc();
864   }
865 
866   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
867   R << "loop not vectorized: ";
868   return R;
869 }
870 
871 namespace llvm {
872 
873 void reportVectorizationFailure(const StringRef DebugMsg,
874     const StringRef OREMsg, const StringRef ORETag,
875     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
876   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
877   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
878   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
879                 ORETag, TheLoop, I) << OREMsg);
880 }
881 
882 } // end namespace llvm
883 
884 #ifndef NDEBUG
885 /// \return string containing a file name and a line # for the given loop.
886 static std::string getDebugLocString(const Loop *L) {
887   std::string Result;
888   if (L) {
889     raw_string_ostream OS(Result);
890     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
891       LoopDbgLoc.print(OS);
892     else
893       // Just print the module name.
894       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
895     OS.flush();
896   }
897   return Result;
898 }
899 #endif
900 
901 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
902                                          const Instruction *Orig) {
903   // If the loop was versioned with memchecks, add the corresponding no-alias
904   // metadata.
905   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
906     LVer->annotateInstWithNoAlias(To, Orig);
907 }
908 
909 void InnerLoopVectorizer::addMetadata(Instruction *To,
910                                       Instruction *From) {
911   propagateMetadata(To, From);
912   addNewMetadata(To, From);
913 }
914 
915 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
916                                       Instruction *From) {
917   for (Value *V : To) {
918     if (Instruction *I = dyn_cast<Instruction>(V))
919       addMetadata(I, From);
920   }
921 }
922 
923 namespace llvm {
924 
925 // Hints to the loop vectorization cost model about how the scalar epilogue
926 // loop should be lowered.
927 enum ScalarEpilogueLowering {
928 
929   // The default: allowing scalar epilogues.
930   CM_ScalarEpilogueAllowed,
931 
932   // Vectorization with OptForSize: don't allow epilogues.
933   CM_ScalarEpilogueNotAllowedOptSize,
934 
935   // A special case of vectorization with OptForSize: loops with a very small
936   // trip count are considered for vectorization under OptForSize, thereby
937   // making sure the cost of their loop body is dominant, free of runtime
938   // guards and scalar iteration overheads.
939   CM_ScalarEpilogueNotAllowedLowTripLoop,
940 
941   // Loop hint predicate indicating an epilogue is undesired.
942   CM_ScalarEpilogueNotNeededUsePredicate
943 };
944 
945 /// LoopVectorizationCostModel - estimates the expected speedups due to
946 /// vectorization.
947 /// In many cases vectorization is not profitable. This can happen for
948 /// a number of reasons. In this class we mainly attempt to predict the
949 /// expected speedup/slowdowns due to the supported instruction set. We use the
950 /// TargetTransformInfo to query the different backends for the cost of
951 /// different operations.
952 class LoopVectorizationCostModel {
953 public:
954   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
955                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
956                              LoopVectorizationLegality *Legal,
957                              const TargetTransformInfo &TTI,
958                              const TargetLibraryInfo *TLI, DemandedBits *DB,
959                              AssumptionCache *AC,
960                              OptimizationRemarkEmitter *ORE, const Function *F,
961                              const LoopVectorizeHints *Hints,
962                              InterleavedAccessInfo &IAI)
963       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
964         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
965         Hints(Hints), InterleaveInfo(IAI) {}
966 
967   /// \return An upper bound for the vectorization factor, or None if
968   /// vectorization and interleaving should be avoided up front.
969   Optional<unsigned> computeMaxVF();
970 
971   /// \return True if runtime checks are required for vectorization, and false
972   /// otherwise.
973   bool runtimeChecksRequired();
974 
975   /// \return The most profitable vectorization factor and the cost of that VF.
976   /// This method checks every power of two up to MaxVF. If UserVF is nonzero,
977   /// then this vectorization factor will be selected if vectorization is
978   /// possible.
979   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
980 
981   /// Setup cost-based decisions for user vectorization factor.
982   void selectUserVectorizationFactor(unsigned UserVF) {
983     collectUniformsAndScalars(UserVF);
984     collectInstsToScalarize(UserVF);
985   }
986 
987   /// \return The size (in bits) of the smallest and widest types in the code
988   /// that needs to be vectorized. We ignore values that remain scalar such as
989   /// 64 bit loop indices.
990   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
991 
992   /// \return The desired interleave count.
993   /// If interleave count has been specified by metadata it will be returned.
994   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
995   /// are the selected vectorization factor and the cost of the selected VF.
996   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
997 
998   /// A memory access instruction may be vectorized in more than one way.
999   /// The form of the instruction after vectorization depends on its cost.
1000   /// This function takes cost-based decisions for Load/Store instructions
1001   /// and collects them in a map. This decision map is used for building
1002   /// the lists of loop-uniform and loop-scalar instructions.
1003   /// The calculated cost is saved with the widening decision in order to
1004   /// avoid redundant calculations.
1005   void setCostBasedWideningDecision(unsigned VF);
1006 
1007   /// A struct that represents some properties of the register usage
1008   /// of a loop.
1009   struct RegisterUsage {
1010     /// Holds the number of loop invariant values that are used in the loop.
1011     /// The key is the ClassID of the target-provided register class.
1012     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1013     /// Holds the maximum number of concurrent live intervals in the loop.
1014     /// The key is the ClassID of the target-provided register class.
1015     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1016   };
1017 
1018   /// \return Information about the register usage of the loop for the
1019   /// given vectorization factors.
1020   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1021 
1022   /// Collect values we want to ignore in the cost model.
1023   void collectValuesToIgnore();
1024 
1025   /// \returns The smallest bitwidth each instruction can be represented with.
1026   /// The vector equivalents of these instructions should be truncated to this
1027   /// type.
1028   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1029     return MinBWs;
1030   }
1031 
1032   /// \returns True if it is more profitable to scalarize instruction \p I for
1033   /// vectorization factor \p VF.
1034   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1035     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1036 
1037     // Cost model is not run in the VPlan-native path - return conservative
1038     // result until this changes.
1039     if (EnableVPlanNativePath)
1040       return false;
1041 
1042     auto Scalars = InstsToScalarize.find(VF);
1043     assert(Scalars != InstsToScalarize.end() &&
1044            "VF not yet analyzed for scalarization profitability");
1045     return Scalars->second.find(I) != Scalars->second.end();
1046   }
1047 
1048   /// Returns true if \p I is known to be uniform after vectorization.
1049   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1050     if (VF == 1)
1051       return true;
1052 
1053     // Cost model is not run in the VPlan-native path - return conservative
1054     // result until this changes.
1055     if (EnableVPlanNativePath)
1056       return false;
1057 
1058     auto UniformsPerVF = Uniforms.find(VF);
1059     assert(UniformsPerVF != Uniforms.end() &&
1060            "VF not yet analyzed for uniformity");
1061     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1062   }
1063 
1064   /// Returns true if \p I is known to be scalar after vectorization.
1065   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1066     if (VF == 1)
1067       return true;
1068 
1069     // Cost model is not run in the VPlan-native path - return conservative
1070     // result until this changes.
1071     if (EnableVPlanNativePath)
1072       return false;
1073 
1074     auto ScalarsPerVF = Scalars.find(VF);
1075     assert(ScalarsPerVF != Scalars.end() &&
1076            "Scalar values are not calculated for VF");
1077     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1078   }
1079 
1080   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1081   /// for vectorization factor \p VF.
1082   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1083     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1084            !isProfitableToScalarize(I, VF) &&
1085            !isScalarAfterVectorization(I, VF);
1086   }
1087 
1088   /// Decision that was taken during cost calculation for memory instruction.
1089   enum InstWidening {
1090     CM_Unknown,
1091     CM_Widen,         // For consecutive accesses with stride +1.
1092     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1093     CM_Interleave,
1094     CM_GatherScatter,
1095     CM_Scalarize
1096   };
1097 
1098   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1099   /// instruction \p I and vector width \p VF.
1100   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1101                            unsigned Cost) {
1102     assert(VF >= 2 && "Expected VF >=2");
1103     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1104   }
1105 
1106   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1107   /// interleaving group \p Grp and vector width \p VF.
1108   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1109                            InstWidening W, unsigned Cost) {
1110     assert(VF >= 2 && "Expected VF >=2");
1111     // Broadcast this decision to all instructions inside the group.
1112     // But the cost will be assigned to one instruction only.
1113     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1114       if (auto *I = Grp->getMember(i)) {
1115         if (Grp->getInsertPos() == I)
1116           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1117         else
1118           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1119       }
1120     }
1121   }
1122 
1123   /// Return the cost model decision for the given instruction \p I and vector
1124   /// width \p VF. Return CM_Unknown if this instruction did not pass
1125   /// through the cost modeling.
1126   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1127     assert(VF >= 2 && "Expected VF >=2");
1128 
1129     // Cost model is not run in the VPlan-native path - return conservative
1130     // result until this changes.
1131     if (EnableVPlanNativePath)
1132       return CM_GatherScatter;
1133 
1134     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1135     auto Itr = WideningDecisions.find(InstOnVF);
1136     if (Itr == WideningDecisions.end())
1137       return CM_Unknown;
1138     return Itr->second.first;
1139   }
1140 
1141   /// Return the vectorization cost for the given instruction \p I and vector
1142   /// width \p VF.
1143   unsigned getWideningCost(Instruction *I, unsigned VF) {
1144     assert(VF >= 2 && "Expected VF >=2");
1145     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1146     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1147            "The cost is not calculated");
1148     return WideningDecisions[InstOnVF].second;
1149   }
1150 
1151   /// Return True if instruction \p I is an optimizable truncate whose operand
1152   /// is an induction variable. Such a truncate will be removed by adding a new
1153   /// induction variable with the destination type.
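  /// For example (illustrative), a `trunc i64 %iv to i32` of an i64 induction
  /// variable can be handled by building the widened induction directly in the
  /// narrower i32 type, making the explicit truncate unnecessary.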
1154   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1155     // If the instruction is not a truncate, return false.
1156     auto *Trunc = dyn_cast<TruncInst>(I);
1157     if (!Trunc)
1158       return false;
1159 
1160     // Get the source and destination types of the truncate.
1161     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1162     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1163 
1164     // If the truncate is free for the given types, return false. Replacing a
1165     // free truncate with an induction variable would add an induction variable
1166     // update instruction to each iteration of the loop. We exclude from this
1167     // check the primary induction variable since it will need an update
1168     // instruction regardless.
1169     Value *Op = Trunc->getOperand(0);
1170     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1171       return false;
1172 
1173     // If the truncated value is not an induction variable, return false.
1174     return Legal->isInductionPhi(Op);
1175   }
1176 
1177   /// Collects the instructions to scalarize for each predicated instruction in
1178   /// the loop.
1179   void collectInstsToScalarize(unsigned VF);
1180 
1181   /// Collect Uniform and Scalar values for the given \p VF.
1182   /// The sets depend on CM decision for Load/Store instructions
1183   /// that may be vectorized as interleave, gather-scatter or scalarized.
1184   void collectUniformsAndScalars(unsigned VF) {
1185     // Do the analysis once.
1186     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1187       return;
1188     setCostBasedWideningDecision(VF);
1189     collectLoopUniforms(VF);
1190     collectLoopScalars(VF);
1191   }
1192 
1193   /// Returns true if the target machine supports masked store operation
1194   /// for the given \p DataType and kind of access to \p Ptr.
1195   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1196     return Legal->isConsecutivePtr(Ptr) &&
1197            TTI.isLegalMaskedStore(DataType, Alignment);
1198   }
1199 
1200   /// Returns true if the target machine supports masked load operation
1201   /// for the given \p DataType and kind of access to \p Ptr.
1202   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1203     return Legal->isConsecutivePtr(Ptr) &&
1204            TTI.isLegalMaskedLoad(DataType, Alignment);
1205   }
1206 
1207   /// Returns true if the target machine supports masked scatter operation
1208   /// for the given \p DataType.
1209   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1210     return TTI.isLegalMaskedScatter(DataType, Alignment);
1211   }
1212 
1213   /// Returns true if the target machine supports masked gather operation
1214   /// for the given \p DataType.
1215   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1216     return TTI.isLegalMaskedGather(DataType, Alignment);
1217   }
1218 
1219   /// Returns true if the target machine can represent \p V as a masked gather
1220   /// or scatter operation.
1221   bool isLegalGatherOrScatter(Value *V) {
1222     bool LI = isa<LoadInst>(V);
1223     bool SI = isa<StoreInst>(V);
1224     if (!LI && !SI)
1225       return false;
1226     auto *Ty = getMemInstValueType(V);
1227     MaybeAlign Align = getLoadStoreAlignment(V);
1228     return (LI && isLegalMaskedGather(Ty, Align)) ||
1229            (SI && isLegalMaskedScatter(Ty, Align));
1230   }
1231 
1232   /// Returns true if \p I is an instruction that will be scalarized with
1233   /// predication. Such instructions include conditional stores and
1234   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1237   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1238 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// This is a superset of the instructions that return true for
  /// isScalarWithPredication.
1242   bool isPredicatedInst(Instruction *I) {
1243     if (!blockNeedsPredication(I->getParent()))
1244       return false;
1245     // Loads and stores that need some form of masked operation are predicated
1246     // instructions.
1247     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1248       return Legal->isMaskRequired(I);
1249     return isScalarWithPredication(I);
1250   }
1251 
1252   /// Returns true if \p I is a memory instruction with consecutive memory
1253   /// access that can be widened.
1254   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1255 
1256   /// Returns true if \p I is a memory instruction in an interleaved-group
1257   /// of memory accesses that can be vectorized with wide vector loads/stores
1258   /// and shuffles.
1259   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1260 
1261   /// Check if \p Instr belongs to any interleaved access group.
1262   bool isAccessInterleaved(Instruction *Instr) {
1263     return InterleaveInfo.isInterleaved(Instr);
1264   }
1265 
1266   /// Get the interleaved access group that \p Instr belongs to.
1267   const InterleaveGroup<Instruction> *
1268   getInterleavedAccessGroup(Instruction *Instr) {
1269     return InterleaveInfo.getInterleaveGroup(Instr);
1270   }
1271 
1272   /// Returns true if an interleaved group requires a scalar iteration
1273   /// to handle accesses with gaps, and there is nothing preventing us from
1274   /// creating a scalar epilogue.
1275   bool requiresScalarEpilogue() const {
1276     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1277   }
1278 
1279   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1280   /// loop hint annotation.
1281   bool isScalarEpilogueAllowed() const {
1282     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1283   }
1284 
  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
1286   bool foldTailByMasking() const { return FoldTailByMasking; }
1287 
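  /// Returns true if the instructions in \p BB will be predicated, either
  /// because the whole loop tail is folded by masking or because the block
  /// needed predication in the original loop.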
1288   bool blockNeedsPredication(BasicBlock *BB) {
1289     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1290   }
1291 
1292   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1293   /// with factor VF.  Return the cost of the instruction, including
1294   /// scalarization overhead if it's needed.
1295   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1296 
1297   /// Estimate cost of a call instruction CI if it were vectorized with factor
1298   /// VF. Return the cost of the instruction, including scalarization overhead
1299   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or is too
  /// expensive.
1302   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1303 
1304 private:
1305   unsigned NumPredStores = 0;
1306 
1307   /// \return An upper bound for the vectorization factor, larger than zero.
1308   /// One is returned if vectorization should best be avoided due to cost.
1309   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1310 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1318   using VectorizationCostTy = std::pair<unsigned, bool>;
1319 
1320   /// Returns the expected execution cost. The unit of the cost does
1321   /// not matter because we use the 'cost' units to compare different
1322   /// vector widths. The cost that is returned is *not* normalized by
1323   /// the factor width.
1324   VectorizationCostTy expectedCost(unsigned VF);
1325 
1326   /// Returns the execution time cost of an instruction for a given vector
1327   /// width. Vector width of one means scalar.
1328   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1329 
1330   /// The cost-computation logic from getInstructionCost which provides
1331   /// the vector type as an output parameter.
1332   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1333 
1334   /// Calculate vectorization cost of memory instruction \p I.
1335   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1336 
1337   /// The cost computation for scalarized memory instruction.
1338   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1339 
1340   /// The cost computation for interleaving group of memory instructions.
1341   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1342 
1343   /// The cost computation for Gather/Scatter instruction.
1344   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1345 
1346   /// The cost computation for widening instruction \p I with consecutive
1347   /// memory access.
1348   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1349 
  /// The cost calculation for Load/Store instruction \p I with uniform pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
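  /// For example (an illustrative sketch), the load of a[0] in
  ///   for (i = 0; i < n; ++i) sum += a[0] * b[i];
  /// is costed as one scalar load plus a broadcast of the loaded value rather
  /// than as a gather.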
1354   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1355 
1356   /// Estimate the overhead of scalarizing an instruction. This is a
1357   /// convenience wrapper for the type-based getScalarizationOverhead API.
1358   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1359 
  /// Returns whether the instruction is a load or store that will be emitted
  /// as a vector operation.
1362   bool isConsecutiveLoadOrStore(Instruction *I);
1363 
1364   /// Returns true if an artificially high cost for emulated masked memrefs
1365   /// should be used.
1366   bool useEmulatedMaskMemRefHack(Instruction *I);
1367 
1368   /// Map of scalar integer values to the smallest bitwidth they can be legally
1369   /// represented as. The vector equivalents of these values should be truncated
1370   /// to this type.
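  /// For example (an illustrative case), if every use of a 32-bit value is
  /// eventually truncated to i8, only its low 8 bits are demanded, so it can
  /// be mapped to a bitwidth of 8 and its vector form can use an i8 element
  /// type.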
1371   MapVector<Instruction *, uint64_t> MinBWs;
1372 
1373   /// A type representing the costs for instructions if they were to be
1374   /// scalarized rather than vectorized. The entries are Instruction-Cost
1375   /// pairs.
1376   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1377 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1380   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1381 
1382   /// Records whether it is allowed to have the original scalar loop execute at
1383   /// least once. This may be needed as a fallback loop in case runtime
1384   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the
  /// VF, or as a peel-loop to handle gaps in interleave-groups.
1387   /// Under optsize and when the trip count is very small we don't allow any
1388   /// iterations to execute in the scalar loop.
1389   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1390 
1391   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1392   bool FoldTailByMasking = false;
1393 
1394   /// A map holding scalar costs for different vectorization factors. The
1395   /// presence of a cost for an instruction in the mapping indicates that the
1396   /// instruction will be scalarized when vectorizing with the associated
1397   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1398   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1399 
1400   /// Holds the instructions known to be uniform after vectorization.
1401   /// The data is collected per VF.
1402   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1403 
1404   /// Holds the instructions known to be scalar after vectorization.
1405   /// The data is collected per VF.
1406   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1407 
1408   /// Holds the instructions (address computations) that are forced to be
1409   /// scalarized.
1410   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1411 
1412   /// Returns the expected difference in cost from scalarizing the expression
1413   /// feeding a predicated instruction \p PredInst. The instructions to
1414   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1415   /// non-negative return value implies the expression will be scalarized.
1416   /// Currently, only single-use chains are considered for scalarization.
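  /// For example (an illustrative sketch), if a predicated store is the only
  /// user of a chain of adds, scalarizing the adds together with the store
  /// may be cheaper than emitting vector adds plus extractelements inside the
  /// predicated block; the returned discount reflects that difference.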
1417   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1418                               unsigned VF);
1419 
1420   /// Collect the instructions that are uniform after vectorization. An
1421   /// instruction is uniform if we represent it with a single scalar value in
1422   /// the vectorized loop corresponding to each vector iteration. Examples of
1423   /// uniform instructions include pointer operands of consecutive or
1424   /// interleaved memory accesses. Note that although uniformity implies an
1425   /// instruction will be scalar, the reverse is not true. In general, a
1426   /// scalarized instruction will be represented by VF scalar values in the
1427   /// vectorized loop, each corresponding to an iteration of the original
1428   /// scalar loop.
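  /// For example (an illustrative sketch), in
  ///   for (i = 0; i < n; ++i) a[i] = b[i] + 1;
  /// the address computations of a[i] and b[i] feed consecutive accesses, so
  /// only their first-lane values are needed and they remain uniform after
  /// vectorization.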
1429   void collectLoopUniforms(unsigned VF);
1430 
1431   /// Collect the instructions that are scalar after vectorization. An
1432   /// instruction is scalar if it is known to be uniform or will be scalarized
1433   /// during vectorization. Non-uniform scalarized instructions will be
1434   /// represented by VF values in the vectorized loop, each corresponding to an
1435   /// iteration of the original scalar loop.
1436   void collectLoopScalars(unsigned VF);
1437 
  /// Keeps the cost model's vectorization decision and cost for each
  /// instruction. Right now it is used for memory instructions only.
1440   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1441                                 std::pair<InstWidening, unsigned>>;
1442 
1443   DecisionList WideningDecisions;
1444 
1445   /// Returns true if \p V is expected to be vectorized and it needs to be
1446   /// extracted.
1447   bool needsExtract(Value *V, unsigned VF) const {
1448     Instruction *I = dyn_cast<Instruction>(V);
1449     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1450       return false;
1451 
1452     // Assume we can vectorize V (and hence we need extraction) if the
1453     // scalars are not computed yet. This can happen, because it is called
1454     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1455     // the scalars are collected. That should be a safe assumption in most
1456     // cases, because we check if the operands have vectorizable types
1457     // beforehand in LoopVectorizationLegality.
1458     return Scalars.find(VF) == Scalars.end() ||
1459            !isScalarAfterVectorization(I, VF);
  }
1461 
  /// Returns a vector containing only the operands needing to be extracted.
1463   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1464                                                    unsigned VF) {
1465     return SmallVector<Value *, 4>(make_filter_range(
1466         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1467   }
1468 
1469 public:
1470   /// The loop that we evaluate.
1471   Loop *TheLoop;
1472 
1473   /// Predicated scalar evolution analysis.
1474   PredicatedScalarEvolution &PSE;
1475 
1476   /// Loop Info analysis.
1477   LoopInfo *LI;
1478 
1479   /// Vectorization legality.
1480   LoopVectorizationLegality *Legal;
1481 
1482   /// Vector target information.
1483   const TargetTransformInfo &TTI;
1484 
1485   /// Target Library Info.
1486   const TargetLibraryInfo *TLI;
1487 
1488   /// Demanded bits analysis.
1489   DemandedBits *DB;
1490 
1491   /// Assumption cache.
1492   AssumptionCache *AC;
1493 
1494   /// Interface to emit optimization remarks.
1495   OptimizationRemarkEmitter *ORE;
1496 
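  /// The function containing the loop being vectorized.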
1497   const Function *TheFunction;
1498 
1499   /// Loop Vectorize Hint.
1500   const LoopVectorizeHints *Hints;
1501 
1502   /// The interleave access information contains groups of interleaved accesses
1503   /// with the same stride and close to each other.
1504   InterleavedAccessInfo &InterleaveInfo;
1505 
1506   /// Values to ignore in the cost model.
1507   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1508 
1509   /// Values to ignore in the cost model when VF > 1.
1510   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1511 };
1512 
1513 } // end namespace llvm
1514 
1515 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1516 // vectorization. The loop needs to be annotated with #pragma omp simd
1517 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1518 // vector length information is not provided, vectorization is not considered
1519 // explicit. Interleave hints are not allowed either. These limitations will be
1520 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1522 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1523 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1524 // provides *explicit vectorization hints* (LV can bypass legal checks and
1525 // assume that vectorization is legal). However, both hints are implemented
1526 // using the same metadata (llvm.loop.vectorize, processed by
1527 // LoopVectorizeHints). This will be fixed in the future when the native IR
1528 // representation for pragma 'omp simd' is introduced.
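// For example (an illustrative sketch), an outer loop annotated with
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// qualifies as explicitly vectorized, whereas the same annotation without the
// vectorize_width clause does not.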
1529 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1530                                    OptimizationRemarkEmitter *ORE) {
1531   assert(!OuterLp->empty() && "This is not an outer loop");
1532   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1533 
1534   // Only outer loops with an explicit vectorization hint are supported.
1535   // Unannotated outer loops are ignored.
1536   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1537     return false;
1538 
1539   Function *Fn = OuterLp->getHeader()->getParent();
1540   if (!Hints.allowVectorization(Fn, OuterLp,
1541                                 true /*VectorizeOnlyWhenForced*/)) {
1542     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1543     return false;
1544   }
1545 
1546   if (Hints.getInterleave() > 1) {
1547     // TODO: Interleave support is future work.
1548     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1549                          "outer loops.\n");
1550     Hints.emitRemarkWithHints();
1551     return false;
1552   }
1553 
1554   return true;
1555 }
1556 
1557 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1558                                   OptimizationRemarkEmitter *ORE,
1559                                   SmallVectorImpl<Loop *> &V) {
1560   // Collect inner loops and outer loops without irreducible control flow. For
1561   // now, only collect outer loops that have explicit vectorization hints. If we
1562   // are stress testing the VPlan H-CFG construction, we collect the outermost
1563   // loop of every loop nest.
1564   if (L.empty() || VPlanBuildStressTest ||
1565       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1566     LoopBlocksRPO RPOT(&L);
1567     RPOT.perform(LI);
1568     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1569       V.push_back(&L);
1570       // TODO: Collect inner loops inside marked outer loops in case
1571       // vectorization fails for the outer loop. Do not invoke
1572       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1573       // already known to be reducible. We can use an inherited attribute for
1574       // that.
1575       return;
1576     }
1577   }
1578   for (Loop *InnerL : L)
1579     collectSupportedLoops(*InnerL, LI, ORE, V);
1580 }
1581 
1582 namespace {
1583 
1584 /// The LoopVectorize Pass.
1585 struct LoopVectorize : public FunctionPass {
1586   /// Pass identification, replacement for typeid
1587   static char ID;
1588 
1589   LoopVectorizePass Impl;
1590 
1591   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1592                          bool VectorizeOnlyWhenForced = false)
1593       : FunctionPass(ID) {
1594     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1595     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1596     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1597   }
1598 
1599   bool runOnFunction(Function &F) override {
1600     if (skipFunction(F))
1601       return false;
1602 
1603     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1604     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1605     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1606     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1607     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1608     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1609     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1610     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1611     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1612     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1613     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1614     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1615     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1616 
1617     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1618         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1619 
1620     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1621                         GetLAA, *ORE, PSI);
1622   }
1623 
1624   void getAnalysisUsage(AnalysisUsage &AU) const override {
1625     AU.addRequired<AssumptionCacheTracker>();
1626     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1627     AU.addRequired<DominatorTreeWrapperPass>();
1628     AU.addRequired<LoopInfoWrapperPass>();
1629     AU.addRequired<ScalarEvolutionWrapperPass>();
1630     AU.addRequired<TargetTransformInfoWrapperPass>();
1631     AU.addRequired<AAResultsWrapperPass>();
1632     AU.addRequired<LoopAccessLegacyAnalysis>();
1633     AU.addRequired<DemandedBitsWrapperPass>();
1634     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1635     AU.addRequired<InjectTLIMappingsLegacy>();
1636 
1637     // We currently do not preserve loopinfo/dominator analyses with outer loop
1638     // vectorization. Until this is addressed, mark these analyses as preserved
1639     // only for non-VPlan-native path.
1640     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1641     if (!EnableVPlanNativePath) {
1642       AU.addPreserved<LoopInfoWrapperPass>();
1643       AU.addPreserved<DominatorTreeWrapperPass>();
1644     }
1645 
1646     AU.addPreserved<BasicAAWrapperPass>();
1647     AU.addPreserved<GlobalsAAWrapperPass>();
1648     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1649   }
1650 };
1651 
1652 } // end anonymous namespace
1653 
1654 //===----------------------------------------------------------------------===//
1655 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1656 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1657 //===----------------------------------------------------------------------===//
1658 
1659 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1660   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1663   Instruction *Instr = dyn_cast<Instruction>(V);
1664   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1665                      (!Instr ||
1666                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1667   // Place the code for broadcasting invariant variables in the new preheader.
1668   IRBuilder<>::InsertPointGuard Guard(Builder);
1669   if (SafeToHoist)
1670     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1671 
1672   // Broadcast the scalar into all locations in the vector.
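  // For VF = 4 this typically expands to an insertelement of V into lane 0 of
  // an undef vector followed by a shufflevector with an all-zero mask (an
  // illustrative sketch of what CreateVectorSplat produces).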
1673   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1674 
1675   return Shuf;
1676 }
1677 
1678 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1679     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1680   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1681          "Expected either an induction phi-node or a truncate of it!");
1682   Value *Start = II.getStartValue();
1683 
1684   // Construct the initial value of the vector IV in the vector loop preheader
1685   auto CurrIP = Builder.saveIP();
1686   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1687   if (isa<TruncInst>(EntryVal)) {
1688     assert(Start->getType()->isIntegerTy() &&
1689            "Truncation requires an integer type");
1690     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1691     Step = Builder.CreateTrunc(Step, TruncType);
1692     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1693   }
1694   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1695   Value *SteppedStart =
1696       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1697 
1698   // We create vector phi nodes for both integer and floating-point induction
1699   // variables. Here, we determine the kind of arithmetic we will perform.
1700   Instruction::BinaryOps AddOp;
1701   Instruction::BinaryOps MulOp;
1702   if (Step->getType()->isIntegerTy()) {
1703     AddOp = Instruction::Add;
1704     MulOp = Instruction::Mul;
1705   } else {
1706     AddOp = II.getInductionOpcode();
1707     MulOp = Instruction::FMul;
1708   }
1709 
1710   // Multiply the vectorization factor by the step using integer or
1711   // floating-point arithmetic as appropriate.
1712   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1713   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1714 
1715   // Create a vector splat to use in the induction update.
1716   //
1717   // FIXME: If the step is non-constant, we create the vector splat with
1718   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1719   //        handle a constant vector splat.
1720   Value *SplatVF =
1721       isa<Constant>(Mul)
1722           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1723           : Builder.CreateVectorSplat(VF, Mul);
1724   Builder.restoreIP(CurrIP);
1725 
1726   // We may need to add the step a number of times, depending on the unroll
1727   // factor. The last of those goes into the PHI.
1728   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1729                                     &*LoopVectorBody->getFirstInsertionPt());
1730   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1731   Instruction *LastInduction = VecInd;
1732   for (unsigned Part = 0; Part < UF; ++Part) {
1733     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1734 
1735     if (isa<TruncInst>(EntryVal))
1736       addMetadata(LastInduction, EntryVal);
1737     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1738 
1739     LastInduction = cast<Instruction>(addFastMathFlag(
1740         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1741     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1742   }
1743 
1744   // Move the last step to the end of the latch block. This ensures consistent
1745   // placement of all induction updates.
1746   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1747   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1748   auto *ICmp = cast<Instruction>(Br->getCondition());
1749   LastInduction->moveBefore(ICmp);
1750   LastInduction->setName("vec.ind.next");
1751 
1752   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1753   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1754 }
1755 
1756 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1757   return Cost->isScalarAfterVectorization(I, VF) ||
1758          Cost->isProfitableToScalarize(I, VF);
1759 }
1760 
1761 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1762   if (shouldScalarizeInstruction(IV))
1763     return true;
1764   auto isScalarInst = [&](User *U) -> bool {
1765     auto *I = cast<Instruction>(U);
1766     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1767   };
1768   return llvm::any_of(IV->users(), isScalarInst);
1769 }
1770 
1771 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1772     const InductionDescriptor &ID, const Instruction *EntryVal,
1773     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1774   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1775          "Expected either an induction phi-node or a truncate of it!");
1776 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted phi is equal to the
  // uncasted phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't
  // have to do any recording in this case - that is done when the original IV
  // is processed.
1783   if (isa<TruncInst>(EntryVal))
1784     return;
1785 
1786   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1787   if (Casts.empty())
1788     return;
1789   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any exist) have no uses outside the
1791   // induction update chain itself.
1792   Instruction *CastInst = *Casts.begin();
1793   if (Lane < UINT_MAX)
1794     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1795   else
1796     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1797 }
1798 
1799 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1800   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1801          "Primary induction variable must have an integer type");
1802 
1803   auto II = Legal->getInductionVars().find(IV);
1804   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1805 
1806   auto ID = II->second;
1807   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1808 
1809   // The scalar value to broadcast. This will be derived from the canonical
1810   // induction variable.
1811   Value *ScalarIV = nullptr;
1812 
1813   // The value from the original loop to which we are mapping the new induction
1814   // variable.
1815   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1816 
1817   // True if we have vectorized the induction variable.
1818   auto VectorizedIV = false;
1819 
1820   // Determine if we want a scalar version of the induction variable. This is
1821   // true if the induction variable itself is not widened, or if it has at
1822   // least one user in the loop that is not widened.
1823   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1824 
1825   // Generate code for the induction step. Note that induction steps are
1826   // required to be loop-invariant
1827   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1828          "Induction step should be loop invariant");
1829   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1830   Value *Step = nullptr;
1831   if (PSE.getSE()->isSCEVable(IV->getType())) {
1832     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1833     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1834                              LoopVectorPreHeader->getTerminator());
1835   } else {
1836     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1837   }
1838 
1839   // Try to create a new independent vector induction variable. If we can't
1840   // create the phi node, we will splat the scalar induction variable in each
1841   // loop iteration.
1842   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1843     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1844     VectorizedIV = true;
1845   }
1846 
1847   // If we haven't yet vectorized the induction variable, or if we will create
1848   // a scalar one, we need to define the scalar induction variable and step
1849   // values. If we were given a truncation type, truncate the canonical
1850   // induction variable and step. Otherwise, derive these values from the
1851   // induction descriptor.
1852   if (!VectorizedIV || NeedsScalarIV) {
1853     ScalarIV = Induction;
1854     if (IV != OldInduction) {
1855       ScalarIV = IV->getType()->isIntegerTy()
1856                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1857                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1858                                           IV->getType());
1859       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1860       ScalarIV->setName("offset.idx");
1861     }
1862     if (Trunc) {
1863       auto *TruncType = cast<IntegerType>(Trunc->getType());
1864       assert(Step->getType()->isIntegerTy() &&
1865              "Truncation requires an integer step");
1866       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1867       Step = Builder.CreateTrunc(Step, TruncType);
1868     }
1869   }
1870 
1871   // If we haven't yet vectorized the induction variable, splat the scalar
1872   // induction variable, and build the necessary step vectors.
1873   // TODO: Don't do it unless the vectorized IV is really required.
1874   if (!VectorizedIV) {
1875     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1876     for (unsigned Part = 0; Part < UF; ++Part) {
1877       Value *EntryPart =
1878           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1879       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1880       if (Trunc)
1881         addMetadata(EntryPart, Trunc);
1882       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1883     }
1884   }
1885 
1886   // If an induction variable is only used for counting loop iterations or
1887   // calculating addresses, it doesn't need to be widened. Create scalar steps
1888   // that can be used by instructions we will later scalarize. Note that the
1889   // addition of the scalar steps will not increase the number of instructions
1890   // in the loop in the common case prior to InstCombine. We will be trading
1891   // one vector extract for each scalar step.
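  // For example (illustrative), with UF = 1, VF = 4 and an integer step of 1,
  // the scalar steps are ScalarIV + 0, ScalarIV + 1, ScalarIV + 2 and
  // ScalarIV + 3.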
1892   if (NeedsScalarIV)
1893     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1894 }
1895 
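// Returns a vector whose lane i holds Val[i] combined (via BinOp for FP
// inductions, add for integer inductions) with (StartIdx + i) * Step. For
// example (an illustrative sketch), an integer induction with StartIdx = 0
// and VF = 4 yields Val + <0, 1, 2, 3> * Step.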
1896 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1897                                           Instruction::BinaryOps BinOp) {
1898   // Create and check the types.
1899   assert(Val->getType()->isVectorTy() && "Must be a vector");
1900   int VLen = Val->getType()->getVectorNumElements();
1901 
1902   Type *STy = Val->getType()->getScalarType();
1903   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1904          "Induction Step must be an integer or FP");
1905   assert(Step->getType() == STy && "Step has wrong type");
1906 
1907   SmallVector<Constant *, 8> Indices;
1908 
1909   if (STy->isIntegerTy()) {
    // Create a vector of VLen consecutive numbers starting from StartIdx.
1911     for (int i = 0; i < VLen; ++i)
1912       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1913 
1914     // Add the consecutive indices to the vector value.
1915     Constant *Cv = ConstantVector::get(Indices);
1916     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1917     Step = Builder.CreateVectorSplat(VLen, Step);
1918     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    //        flags, which can be found from the original scalar operations.
1921     Step = Builder.CreateMul(Cv, Step);
1922     return Builder.CreateAdd(Val, Step, "induction");
1923   }
1924 
1925   // Floating point induction.
1926   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1927          "Binary Opcode should be specified for FP induction");
  // Create a vector of VLen consecutive numbers starting from StartIdx.
1929   for (int i = 0; i < VLen; ++i)
1930     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1931 
1932   // Add the consecutive indices to the vector value.
1933   Constant *Cv = ConstantVector::get(Indices);
1934 
1935   Step = Builder.CreateVectorSplat(VLen, Step);
1936 
1937   // Floating point operations had to be 'fast' to enable the induction.
1938   FastMathFlags Flags;
1939   Flags.setFast();
1940 
1941   Value *MulOp = Builder.CreateFMul(Cv, Step);
1942   if (isa<Instruction>(MulOp))
    // We have to check because the multiply may have been constant-folded.
1944     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1945 
1946   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1947   if (isa<Instruction>(BOp))
1948     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1949   return BOp;
1950 }
1951 
1952 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1953                                            Instruction *EntryVal,
1954                                            const InductionDescriptor &ID) {
1955   // We shouldn't have to build scalar steps if we aren't vectorizing.
1956   assert(VF > 1 && "VF should be greater than one");
1957 
1958   // Get the value type and ensure it and the step have the same integer type.
1959   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1960   assert(ScalarIVTy == Step->getType() &&
1961          "Val and Step should have the same type");
1962 
1963   // We build scalar steps for both integer and floating-point induction
1964   // variables. Here, we determine the kind of arithmetic we will perform.
1965   Instruction::BinaryOps AddOp;
1966   Instruction::BinaryOps MulOp;
1967   if (ScalarIVTy->isIntegerTy()) {
1968     AddOp = Instruction::Add;
1969     MulOp = Instruction::Mul;
1970   } else {
1971     AddOp = ID.getInductionOpcode();
1972     MulOp = Instruction::FMul;
1973   }
1974 
1975   // Determine the number of scalars we need to generate for each unroll
1976   // iteration. If EntryVal is uniform, we only need to generate the first
1977   // lane. Otherwise, we generate all VF values.
1978   unsigned Lanes =
1979       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1980                                                                          : VF;
1981   // Compute the scalar steps and save the results in VectorLoopValueMap.
1982   for (unsigned Part = 0; Part < UF; ++Part) {
1983     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1984       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1985       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1986       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1987       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1988       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1989     }
1990   }
1991 }
1992 
1993 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1994   assert(V != Induction && "The new induction variable should not be used.");
1995   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1996   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1997 
1998   // If we have a stride that is replaced by one, do it here. Defer this for
1999   // the VPlan-native path until we start running Legal checks in that path.
2000   if (!EnableVPlanNativePath && Legal->hasStride(V))
2001     V = ConstantInt::get(V->getType(), 1);
2002 
2003   // If we have a vector mapped to this value, return it.
2004   if (VectorLoopValueMap.hasVectorValue(V, Part))
2005     return VectorLoopValueMap.getVectorValue(V, Part);
2006 
2007   // If the value has not been vectorized, check if it has been scalarized
2008   // instead. If it has been scalarized, and we actually need the value in
2009   // vector form, we will construct the vector values on demand.
2010   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2011     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2012 
2013     // If we've scalarized a value, that value should be an instruction.
2014     auto *I = cast<Instruction>(V);
2015 
2016     // If we aren't vectorizing, we can just copy the scalar map values over to
2017     // the vector map.
2018     if (VF == 1) {
2019       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2020       return ScalarValue;
2021     }
2022 
2023     // Get the last scalar instruction we generated for V and Part. If the value
2024     // is known to be uniform after vectorization, this corresponds to lane zero
2025     // of the Part unroll iteration. Otherwise, the last instruction is the one
2026     // we created for the last vector lane of the Part unroll iteration.
2027     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2028     auto *LastInst = cast<Instruction>(
2029         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2030 
2031     // Set the insert point after the last scalarized instruction. This ensures
2032     // the insertelement sequence will directly follow the scalar definitions.
2033     auto OldIP = Builder.saveIP();
2034     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2035     Builder.SetInsertPoint(&*NewIP);
2036 
2037     // However, if we are vectorizing, we need to construct the vector values.
2038     // If the value is known to be uniform after vectorization, we can just
2039     // broadcast the scalar value corresponding to lane zero for each unroll
2040     // iteration. Otherwise, we construct the vector values using insertelement
2041     // instructions. Since the resulting vectors are stored in
2042     // VectorLoopValueMap, we will only generate the insertelements once.
2043     Value *VectorValue = nullptr;
2044     if (Cost->isUniformAfterVectorization(I, VF)) {
2045       VectorValue = getBroadcastInstrs(ScalarValue);
2046       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2047     } else {
2048       // Initialize packing with insertelements to start from undef.
2049       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2050       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2051       for (unsigned Lane = 0; Lane < VF; ++Lane)
2052         packScalarIntoVectorValue(V, {Part, Lane});
2053       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2054     }
2055     Builder.restoreIP(OldIP);
2056     return VectorValue;
2057   }
2058 
2059   // If this scalar is unknown, assume that it is a constant or that it is
2060   // loop invariant. Broadcast V and save the value for future uses.
2061   Value *B = getBroadcastInstrs(V);
2062   VectorLoopValueMap.setVectorValue(V, Part, B);
2063   return B;
2064 }
2065 
2066 Value *
2067 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2068                                             const VPIteration &Instance) {
2069   // If the value is not an instruction contained in the loop, it should
2070   // already be scalar.
2071   if (OrigLoop->isLoopInvariant(V))
2072     return V;
2073 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2077 
2078   // If the value from the original loop has not been vectorized, it is
2079   // represented by UF x VF scalar values in the new loop. Return the requested
2080   // scalar value.
2081   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2082     return VectorLoopValueMap.getScalarValue(V, Instance);
2083 
2084   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2085   // for the given unroll part. If this entry is not a vector type (i.e., the
2086   // vectorization factor is one), there is no need to generate an
2087   // extractelement instruction.
2088   auto *U = getOrCreateVectorValue(V, Instance.Part);
2089   if (!U->getType()->isVectorTy()) {
2090     assert(VF == 1 && "Value not scalarized has non-vector type");
2091     return U;
2092   }
2093 
2094   // Otherwise, the value from the original loop has been vectorized and is
2095   // represented by UF vector values. Extract and return the requested scalar
2096   // value from the appropriate vector lane.
2097   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2098 }
2099 
2100 void InnerLoopVectorizer::packScalarIntoVectorValue(
2101     Value *V, const VPIteration &Instance) {
2102   assert(V != Induction && "The new induction variable should not be used.");
2103   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2104   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2105 
2106   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2107   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2108   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2109                                             Builder.getInt32(Instance.Lane));
2110   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2111 }
2112 
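// Returns a copy of Vec with its lanes in reverse order, e.g. for VF = 4 the
// shuffle mask is <3, 2, 1, 0>.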
2113 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2114   assert(Vec->getType()->isVectorTy() && "Invalid type");
2115   SmallVector<Constant *, 8> ShuffleMask;
2116   for (unsigned i = 0; i < VF; ++i)
2117     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2118 
2119   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2120                                      ConstantVector::get(ShuffleMask),
2121                                      "reverse");
2122 }
2123 
2124 // Return whether we allow using masked interleave-groups (for dealing with
2125 // strided loads/stores that reside in predicated blocks, or for dealing
2126 // with gaps).
2127 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2128   // If an override option has been passed in for interleaved accesses, use it.
2129   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2130     return EnableMaskedInterleavedMemAccesses;
2131 
2132   return TTI.enableMaskedInterleavedAccessVectorization();
2133 }
2134 
2135 // Try to vectorize the interleave group that \p Instr belongs to.
2136 //
2137 // E.g. Translate following interleaved load group (factor = 3):
2138 //   for (i = 0; i < N; i+=3) {
2139 //     R = Pic[i];             // Member of index 0
2140 //     G = Pic[i+1];           // Member of index 1
2141 //     B = Pic[i+2];           // Member of index 2
2142 //     ... // do something to R, G, B
2143 //   }
2144 // To:
2145 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2146 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2147 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2148 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2149 //
2150 // Or translate following interleaved store group (factor = 3):
2151 //   for (i = 0; i < N; i+=3) {
2152 //     ... do something to R, G, B
2153 //     Pic[i]   = R;           // Member of index 0
2154 //     Pic[i+1] = G;           // Member of index 1
2155 //     Pic[i+2] = B;           // Member of index 2
2156 //   }
2157 // To:
2158 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2159 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2160 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2161 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2162 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2163 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2164                                                    VPTransformState &State,
2165                                                    VPValue *Addr,
2166                                                    VPValue *BlockInMask) {
2167   const InterleaveGroup<Instruction> *Group =
2168       Cost->getInterleavedAccessGroup(Instr);
2169   assert(Group && "Fail to get an interleaved access group.");
2170 
2171   // Skip if current instruction is not the insert position.
2172   if (Instr != Group->getInsertPos())
2173     return;
2174 
2175   const DataLayout &DL = Instr->getModule()->getDataLayout();
2176 
2177   // Prepare for the vector type of the interleaved load/store.
2178   Type *ScalarTy = getMemInstValueType(Instr);
2179   unsigned InterleaveFactor = Group->getFactor();
2180   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2181 
2182   // Prepare for the new pointers.
2183   SmallVector<Value *, 2> AddrParts;
2184   unsigned Index = Group->getIndex(Instr);
2185 
2186   // TODO: extend the masked interleaved-group support to reversed access.
2187   assert((!BlockInMask || !Group->isReverse()) &&
2188          "Reversed masked interleave-group not supported.");
2189 
2190   // If the group is reverse, adjust the index to refer to the last vector lane
2191   // instead of the first. We adjust the index from the first vector lane,
2192   // rather than directly getting the pointer for lane VF - 1, because the
2193   // pointer operand of the interleaved access is supposed to be uniform. For
2194   // uniform instructions, we're only required to generate a value for the
2195   // first vector lane in each unroll iteration.
2196   if (Group->isReverse())
2197     Index += (VF - 1) * Group->getFactor();
2198 
2199   for (unsigned Part = 0; Part < UF; Part++) {
2200     Value *AddrPart = State.get(Addr, {Part, 0});
2201     setDebugLocFromInst(Builder, AddrPart);
2202 
    // Note that the current instruction could be at any index of the group.
    // We need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2214 
2215     bool InBounds = false;
2216     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2217       InBounds = gep->isInBounds();
2218     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2219     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2220 
2221     // Cast to the vector pointer type.
2222     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2223     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2224     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2225   }
2226 
2227   setDebugLocFromInst(Builder, Instr);
2228   Value *UndefVec = UndefValue::get(VecTy);
2229 
2230   Value *MaskForGaps = nullptr;
2231   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2232     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2233     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2234   }
2235 
2236   // Vectorize the interleaved load group.
2237   if (isa<LoadInst>(Instr)) {
2238     // For each unroll part, create a wide load for the group.
2239     SmallVector<Value *, 2> NewLoads;
2240     for (unsigned Part = 0; Part < UF; Part++) {
2241       Instruction *NewLoad;
2242       if (BlockInMask || MaskForGaps) {
2243         assert(useMaskedInterleavedAccesses(*TTI) &&
2244                "masked interleaved groups are not allowed.");
2245         Value *GroupMask = MaskForGaps;
2246         if (BlockInMask) {
2247           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2248           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2249           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2250           Value *ShuffledMask = Builder.CreateShuffleVector(
2251               BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2252           GroupMask = MaskForGaps
2253                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2254                                                 MaskForGaps)
2255                           : ShuffledMask;
2256         }
2257         NewLoad =
2258             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2259                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2263                                             Group->getAlign(), "wide.vec");
2264       Group->addMetadata(NewLoad);
2265       NewLoads.push_back(NewLoad);
2266     }
2267 
2268     // For each member in the group, shuffle out the appropriate data from the
2269     // wide loads.
2270     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2271       Instruction *Member = Group->getMember(I);
2272 
2273       // Skip the gaps in the group.
2274       if (!Member)
2275         continue;
2276 
2277       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2278       for (unsigned Part = 0; Part < UF; Part++) {
2279         Value *StridedVec = Builder.CreateShuffleVector(
2280             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2281 
        // If this member has a different type, cast the result to that type.
2283         if (Member->getType() != ScalarTy) {
2284           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2285           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2286         }
2287 
2288         if (Group->isReverse())
2289           StridedVec = reverseVector(StridedVec);
2290 
2291         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2292       }
2293     }
2294     return;
2295   }
2296 
  // The subvector type for the current instruction.
2298   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2299 
2300   // Vectorize the interleaved store group.
2301   for (unsigned Part = 0; Part < UF; Part++) {
2302     // Collect the stored vector from each member.
2303     SmallVector<Value *, 4> StoredVecs;
2304     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2306       Instruction *Member = Group->getMember(i);
2307       assert(Member && "Fail to get a member from an interleaved store group");
2308 
2309       Value *StoredVec = getOrCreateVectorValue(
2310           cast<StoreInst>(Member)->getValueOperand(), Part);
2311       if (Group->isReverse())
2312         StoredVec = reverseVector(StoredVec);
2313 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2317         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2318 
2319       StoredVecs.push_back(StoredVec);
2320     }
2321 
2322     // Concatenate all vectors into a wide vector.
2323     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2324 
2325     // Interleave the elements in the wide vector.
2326     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2327     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2328                                               "interleaved.vec");
2329 
2330     Instruction *NewStoreInstr;
2331     if (BlockInMask) {
2332       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2333       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2334       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2335       Value *ShuffledMask = Builder.CreateShuffleVector(
2336           BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2337       NewStoreInstr = Builder.CreateMaskedStore(
2338           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
      NewStoreInstr =
2342           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2343 
2344     Group->addMetadata(NewStoreInstr);
2345   }
2346 }
2347 
2348 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2349                                                      VPTransformState &State,
2350                                                      VPValue *Addr,
2351                                                      VPValue *BlockInMask) {
2352   // Attempt to issue a wide load.
2353   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2354   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2355 
2356   assert((LI || SI) && "Invalid Load/Store instruction");
2357 
2358   LoopVectorizationCostModel::InstWidening Decision =
2359       Cost->getWideningDecision(Instr, VF);
2360   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2361          "CM decision should be taken at this point");
2362   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2363     return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2364 
2365   Type *ScalarDataTy = getMemInstValueType(Instr);
2366   Type *DataTy = VectorType::get(ScalarDataTy, VF);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2369   const DataLayout &DL = Instr->getModule()->getDataLayout();
2370   const Align Alignment =
2371       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2372 
2373   // Determine if the pointer operand of the access is either consecutive or
2374   // reverse consecutive.
2375   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2376   bool ConsecutiveStride =
2377       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2378   bool CreateGatherScatter =
2379       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2380 
2381   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2382   // gather/scatter. Otherwise Decision should have been to Scalarize.
2383   assert((ConsecutiveStride || CreateGatherScatter) &&
2384          "The instruction should be scalarized");
2385   (void)ConsecutiveStride;
2386 
2387   VectorParts BlockInMaskParts(UF);
2388   bool isMaskRequired = BlockInMask;
2389   if (isMaskRequired)
2390     for (unsigned Part = 0; Part < UF; ++Part)
2391       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2392 
2393   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2394     // Calculate the pointer for the specific unroll-part.
2395     GetElementPtrInst *PartPtr = nullptr;
2396 
2397     bool InBounds = false;
2398     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2399       InBounds = gep->isInBounds();
2400 
2401     if (Reverse) {
2402       // If the address is consecutive but reversed, then the
2403       // wide store needs to start at the last vector element.
2404       PartPtr = cast<GetElementPtrInst>(
2405           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2406       PartPtr->setIsInBounds(InBounds);
2407       PartPtr = cast<GetElementPtrInst>(
2408           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2409       PartPtr->setIsInBounds(InBounds);
2410       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2411         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2412     } else {
2413       PartPtr = cast<GetElementPtrInst>(
2414           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2415       PartPtr->setIsInBounds(InBounds);
2416     }
2417 
2418     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2419     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2420   };
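       // E.g. (illustrative), with VF = 4: the forward case returns a pointer to
       // element Part * 4 of the scalar access, while the reverse case returns a
       // pointer to element -(Part * 4) - 3, so the wide access still covers 4
       // consecutive elements in memory; the value itself is reversed separately.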
2421 
2422   // Handle Stores:
2423   if (SI) {
2424     setDebugLocFromInst(Builder, SI);
2425 
2426     for (unsigned Part = 0; Part < UF; ++Part) {
2427       Instruction *NewSI = nullptr;
2428       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2429       if (CreateGatherScatter) {
2430         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2431         Value *VectorGep = State.get(Addr, Part);
2432         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2433                                             MaskPart);
2434       } else {
2435         if (Reverse) {
2436           // If we store to reverse consecutive memory locations, then we need
2437           // to reverse the order of elements in the stored value.
2438           StoredVal = reverseVector(StoredVal);
2439           // We don't want to update the value in the map as it might be used in
2440           // another expression. So don't call resetVectorValue(StoredVal).
2441         }
2442         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2443         if (isMaskRequired)
2444           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2445                                             BlockInMaskParts[Part]);
2446         else
2447           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2448       }
2449       addMetadata(NewSI, SI);
2450     }
2451     return;
2452   }
2453 
2454   // Handle loads.
2455   assert(LI && "Must have a load instruction");
2456   setDebugLocFromInst(Builder, LI);
2457   for (unsigned Part = 0; Part < UF; ++Part) {
2458     Value *NewLI;
2459     if (CreateGatherScatter) {
2460       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2461       Value *VectorGep = State.get(Addr, Part);
2462       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2463                                          nullptr, "wide.masked.gather");
2464       addMetadata(NewLI, LI);
2465     } else {
2466       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2467       if (isMaskRequired)
2468         NewLI = Builder.CreateMaskedLoad(
2469             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2470             "wide.masked.load");
2471       else
2472         NewLI =
2473             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2474 
2475       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2476       addMetadata(NewLI, LI);
2477       if (Reverse)
2478         NewLI = reverseVector(NewLI);
2479     }
2480     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2481   }
2482 }
2483 
2484 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2485                                                const VPIteration &Instance,
2486                                                bool IfPredicateInstr) {
2487   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2488 
2489   setDebugLocFromInst(Builder, Instr);
2490 
2491   // Does this instruction return a value?
2492   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2493 
2494   Instruction *Cloned = Instr->clone();
2495   if (!IsVoidRetTy)
2496     Cloned->setName(Instr->getName() + ".cloned");
2497 
2498   // Replace the operands of the cloned instructions with their scalar
2499   // equivalents in the new loop.
2500   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2501     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2502     Cloned->setOperand(op, NewOp);
2503   }
2504   addNewMetadata(Cloned, Instr);
2505 
2506   // Place the cloned scalar in the new loop.
2507   Builder.Insert(Cloned);
2508 
2509   // Add the cloned scalar to the scalar map entry.
2510   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2511 
2512   // If we just cloned a new assumption, add it to the assumption cache.
2513   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2514     if (II->getIntrinsicID() == Intrinsic::assume)
2515       AC->registerAssumption(II);
2516 
2517   // End if-block.
2518   if (IfPredicateInstr)
2519     PredicatedInstructions.push_back(Cloned);
2520 }
2521 
2522 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2523                                                       Value *End, Value *Step,
2524                                                       Instruction *DL) {
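       // In shorthand, the generated loop looks like this (illustrative):
       //   header:
       //     %index = phi [ %Start, %preheader ], [ %index.next, %latch ]
       //     ...
       //   latch:
       //     %index.next = add %index, %Step
       //     %cmp = icmp eq %index.next, %End
       //     br %cmp, %exit, %header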
2525   BasicBlock *Header = L->getHeader();
2526   BasicBlock *Latch = L->getLoopLatch();
2527   // As we're just creating this loop, it's possible no latch exists
2528   // yet. If so, use the header as this will be a single block loop.
2529   if (!Latch)
2530     Latch = Header;
2531 
2532   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2533   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2534   setDebugLocFromInst(Builder, OldInst);
2535   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2536 
2537   Builder.SetInsertPoint(Latch->getTerminator());
2538   setDebugLocFromInst(Builder, OldInst);
2539 
2540   // Create i+1 and fill the PHINode.
2541   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2542   Induction->addIncoming(Start, L->getLoopPreheader());
2543   Induction->addIncoming(Next, Latch);
2544   // Create the compare.
2545   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2546   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2547 
2548   // Now we have two terminators. Remove the old one from the block.
2549   Latch->getTerminator()->eraseFromParent();
2550 
2551   return Induction;
2552 }
2553 
2554 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2555   if (TripCount)
2556     return TripCount;
2557 
2558   assert(L && "Create Trip Count for null loop.");
2559   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2560   // Find the loop boundaries.
2561   ScalarEvolution *SE = PSE.getSE();
2562   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2563   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2564          "Invalid loop count");
2565 
2566   Type *IdxTy = Legal->getWidestInductionType();
2567   assert(IdxTy && "No type for induction");
2568 
2569   // The exit count might have type i64 while the phi has type i32. This can
2570   // happen if we have an induction variable that is sign-extended before the
2571   // compare. The only way we can get a backedge-taken count then is if the
2572   // induction variable was signed and hence will not overflow, in which case
2573   // the truncation is legal.
2574   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2575       IdxTy->getPrimitiveSizeInBits())
2576     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2577   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2578 
2579   // Get the total trip count from the count by adding 1.
2580   const SCEV *ExitCount = SE->getAddExpr(
2581       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2582 
2583   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2584 
2585   // Expand the trip count and place the new instructions in the preheader.
2586   // Notice that the pre-header does not change, only the loop body.
2587   SCEVExpander Exp(*SE, DL, "induction");
2588 
2589   // Count holds the overall loop count (N).
2590   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2591                                 L->getLoopPreheader()->getTerminator());
2592 
2593   if (TripCount->getType()->isPointerTy())
2594     TripCount =
2595         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2596                                     L->getLoopPreheader()->getTerminator());
2597 
2598   return TripCount;
2599 }
2600 
2601 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2602   if (VectorTripCount)
2603     return VectorTripCount;
2604 
2605   Value *TC = getOrCreateTripCount(L);
2606   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2607 
2608   Type *Ty = TC->getType();
2609   Constant *Step = ConstantInt::get(Ty, VF * UF);
2610 
2611   // If the tail is to be folded by masking, round the number of iterations N
2612   // up to a multiple of Step instead of rounding down. This is done by first
2613   // adding Step-1 and then rounding down. Note that it's ok if this addition
2614   // overflows: the vector induction variable will eventually wrap to zero given
2615   // that it starts at zero and its Step is a power of two; the loop will then
2616   // exit, with the last early-exit vector comparison also producing all-true.
2617   if (Cost->foldTailByMasking()) {
2618     assert(isPowerOf2_32(VF * UF) &&
2619            "VF*UF must be a power of 2 when folding tail by masking");
2620     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2621   }
2622 
2623   // Now we need to generate the expression for the part of the loop that the
2624   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2625   // iterations are not required for correctness, or N - Step, otherwise. Step
2626   // is equal to the vectorization factor (number of SIMD elements) times the
2627   // unroll factor (number of SIMD instructions).
2628   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
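       // E.g. (illustrative), with VF = 4 and UF = 2 (Step = 8): for TC = 10 the
       // remainder R is 2, so the vector loop covers 8 iterations and the scalar
       // loop the remaining 2. When folding the tail, TC was rounded up to 17
       // above, giving R = 1 and a vector trip count of 16.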
2629 
2630   // If there is a non-reversed interleaved group that may speculatively access
2631   // memory out-of-bounds, we need to ensure that there will be at least one
2632   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2633   // the trip count, we set the remainder to be equal to the step. If the step
2634   // does not evenly divide the trip count, no adjustment is necessary since
2635   // there will already be scalar iterations. Note that the minimum iterations
2636   // check ensures that N >= Step.
2637   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2638     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2639     R = Builder.CreateSelect(IsZero, Step, R);
2640   }
2641 
2642   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2643 
2644   return VectorTripCount;
2645 }
2646 
2647 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2648                                                    const DataLayout &DL) {
2649   // Verify that V is a vector type with same number of elements as DstVTy.
2650   unsigned VF = DstVTy->getNumElements();
2651   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2652   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2653   Type *SrcElemTy = SrcVecTy->getElementType();
2654   Type *DstElemTy = DstVTy->getElementType();
2655   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2656          "Vector elements must have same size");
2657 
2658   // Do a direct cast if element types are castable.
2659   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2660     return Builder.CreateBitOrPointerCast(V, DstVTy);
2661   }
2662   // V cannot be directly cast to the desired vector type.
2663   // This may happen when V is a floating-point vector but DstVTy is a vector
2664   // of pointers, or vice versa. Handle this with a two-step cast through an
2665   // intermediate integer type, i.e. Ptr <-> Int <-> Float.
2666   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2667          "Only one type should be a pointer type");
2668   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2669          "Only one type should be a floating point type");
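       // E.g. (illustrative), casting <2 x double> to <2 x i8*> on a target with
       // 64-bit pointers goes through an intermediate <2 x i64>.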
2670   Type *IntTy =
2671       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2672   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2673   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2674   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2675 }
2676 
2677 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2678                                                          BasicBlock *Bypass) {
2679   Value *Count = getOrCreateTripCount(L);
2680   // Reuse existing vector loop preheader for TC checks.
2681   // Note that new preheader block is generated for vector loop.
2682   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2683   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2684 
2685   // Generate code to check if the loop's trip count is less than VF * UF, or
2686   // equal to it in case a scalar epilogue is required; this implies that the
2687   // vector trip count is zero. This check also covers the case where adding one
2688   // to the backedge-taken count overflowed leading to an incorrect trip count
2689   // of zero. In this case we will also jump to the scalar loop.
2690   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2691                                           : ICmpInst::ICMP_ULT;
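       // E.g. (illustrative), with VF * UF = 8 and no tail folding: trip counts
       // below 8 branch to the scalar loop; if a scalar epilogue is required, a
       // trip count of exactly 8 does too, since the vector trip count would be 0.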
2692 
2693   // If tail is to be folded, vector loop takes care of all iterations.
2694   Value *CheckMinIters = Builder.getFalse();
2695   if (!Cost->foldTailByMasking())
2696     CheckMinIters = Builder.CreateICmp(
2697         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2698         "min.iters.check");
2699 
2700   // Create new preheader for vector loop.
2701   LoopVectorPreHeader =
2702       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2703                  "vector.ph");
2704 
2705   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2706                                DT->getNode(Bypass)->getIDom()) &&
2707          "TC check is expected to dominate Bypass");
2708 
2709   // Update dominator for Bypass & LoopExit.
2710   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2711   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2712 
2713   ReplaceInstWithInst(
2714       TCCheckBlock->getTerminator(),
2715       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2716   LoopBypassBlocks.push_back(TCCheckBlock);
2717 }
2718 
2719 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2720   // Reuse existing vector loop preheader for SCEV checks.
2721   // Note that new preheader block is generated for vector loop.
2722   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2723 
2724   // Generate the code to check the SCEV assumptions that we made.
2725   // We want the new basic block to start at the first instruction in a
2726   // sequence of instructions that form a check.
2727   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2728                    "scev.check");
2729   Value *SCEVCheck = Exp.expandCodeForPredicate(
2730       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2731 
2732   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2733     if (C->isZero())
2734       return;
2735 
2736   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2737          "Cannot SCEV check stride or overflow when optimizing for size");
2738 
2739   SCEVCheckBlock->setName("vector.scevcheck");
2740   // Create new preheader for vector loop.
2741   LoopVectorPreHeader =
2742       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2743                  nullptr, "vector.ph");
2744 
2745   // Update dominator only if this is first RT check.
2746   if (LoopBypassBlocks.empty()) {
2747     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2748     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2749   }
2750 
2751   ReplaceInstWithInst(
2752       SCEVCheckBlock->getTerminator(),
2753       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2754   LoopBypassBlocks.push_back(SCEVCheckBlock);
2755   AddedSafetyChecks = true;
2756 }
2757 
2758 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2759   // VPlan-native path does not do any analysis for runtime checks currently.
2760   if (EnableVPlanNativePath)
2761     return;
2762 
2763   // Reuse existing vector loop preheader for runtime memory checks.
2764   // Note that new preheader block is generated for vector loop.
2765   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2766 
2767   // Generate the code that checks in runtime if arrays overlap. We put the
2768   // checks into a separate block to make the more common case of few elements
2769   // faster.
2770   Instruction *FirstCheckInst;
2771   Instruction *MemRuntimeCheck;
2772   std::tie(FirstCheckInst, MemRuntimeCheck) =
2773       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2774   if (!MemRuntimeCheck)
2775     return;
2776 
2777   if (MemCheckBlock->getParent()->hasOptSize()) {
2778     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2779            "Cannot emit memory checks when optimizing for size, unless forced "
2780            "to vectorize.");
2781     ORE->emit([&]() {
2782       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2783                                         L->getStartLoc(), L->getHeader())
2784              << "Code-size may be reduced by not forcing "
2785                 "vectorization, or by source-code modifications "
2786                 "eliminating the need for runtime checks "
2787                 "(e.g., adding 'restrict').";
2788     });
2789   }
2790 
2791   MemCheckBlock->setName("vector.memcheck");
2792   // Create new preheader for vector loop.
2793   LoopVectorPreHeader =
2794       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2795                  "vector.ph");
2796 
2797   // Update dominator only if this is first RT check.
2798   if (LoopBypassBlocks.empty()) {
2799     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2800     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2801   }
2802 
2803   ReplaceInstWithInst(
2804       MemCheckBlock->getTerminator(),
2805       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2806   LoopBypassBlocks.push_back(MemCheckBlock);
2807   AddedSafetyChecks = true;
2808 
2809   // We currently don't use LoopVersioning for the actual loop cloning but we
2810   // still use it to add the noalias metadata.
2811   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2812                                           PSE.getSE());
2813   LVer->prepareNoAliasMetadata();
2814 }
2815 
2816 Value *InnerLoopVectorizer::emitTransformedIndex(
2817     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2818     const InductionDescriptor &ID) const {
2819 
2820   SCEVExpander Exp(*SE, DL, "induction");
2821   auto Step = ID.getStep();
2822   auto StartValue = ID.getStartValue();
2823   assert(Index->getType() == Step->getType() &&
2824          "Index type does not match StepValue type");
2825 
2826   // Note: the IR at this point is broken. We cannot use SE to create any new
2827   // SCEV and then expand it, hoping that SCEV's simplification will give us
2828   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2829   // lead to various SCEV crashes. So all we can do is use the builder and rely
2830   // on InstCombine for future simplifications. Here we handle only some
2831   // trivial cases.
2832   auto CreateAdd = [&B](Value *X, Value *Y) {
2833     assert(X->getType() == Y->getType() && "Types don't match!");
2834     if (auto *CX = dyn_cast<ConstantInt>(X))
2835       if (CX->isZero())
2836         return Y;
2837     if (auto *CY = dyn_cast<ConstantInt>(Y))
2838       if (CY->isZero())
2839         return X;
2840     return B.CreateAdd(X, Y);
2841   };
2842 
2843   auto CreateMul = [&B](Value *X, Value *Y) {
2844     assert(X->getType() == Y->getType() && "Types don't match!");
2845     if (auto *CX = dyn_cast<ConstantInt>(X))
2846       if (CX->isOne())
2847         return Y;
2848     if (auto *CY = dyn_cast<ConstantInt>(Y))
2849       if (CY->isOne())
2850         return X;
2851     return B.CreateMul(X, Y);
2852   };
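       // In each case below the result is, in effect, StartValue op (Index * Step),
       // where op is an integer add, a GEP, or the original FAdd/FSub, depending
       // on the induction kind.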
2853 
2854   switch (ID.getKind()) {
2855   case InductionDescriptor::IK_IntInduction: {
2856     assert(Index->getType() == StartValue->getType() &&
2857            "Index type does not match StartValue type");
2858     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2859       return B.CreateSub(StartValue, Index);
2860     auto *Offset = CreateMul(
2861         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2862     return CreateAdd(StartValue, Offset);
2863   }
2864   case InductionDescriptor::IK_PtrInduction: {
2865     assert(isa<SCEVConstant>(Step) &&
2866            "Expected constant step for pointer induction");
2867     return B.CreateGEP(
2868         StartValue->getType()->getPointerElementType(), StartValue,
2869         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2870                                            &*B.GetInsertPoint())));
2871   }
2872   case InductionDescriptor::IK_FpInduction: {
2873     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2874     auto InductionBinOp = ID.getInductionBinOp();
2875     assert(InductionBinOp &&
2876            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2877             InductionBinOp->getOpcode() == Instruction::FSub) &&
2878            "Original bin op should be defined for FP induction");
2879 
2880     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2881 
2882     // Floating point operations had to be 'fast' to enable the induction.
2883     FastMathFlags Flags;
2884     Flags.setFast();
2885 
2886     Value *MulExp = B.CreateFMul(StepValue, Index);
2887     if (isa<Instruction>(MulExp))
2888       // We have to check because MulExp may be a constant.
2889       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2890 
2891     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2892                                "induction");
2893     if (isa<Instruction>(BOp))
2894       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2895 
2896     return BOp;
2897   }
2898   case InductionDescriptor::IK_NoInduction:
2899     return nullptr;
2900   }
2901   llvm_unreachable("invalid enum");
2902 }
2903 
2904 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2905   /*
2906    In this function we generate a new loop. The new loop will contain
2907    the vectorized instructions while the old loop will continue to run the
2908    scalar remainder.
2909 
2910        [ ] <-- loop iteration number check.
2911     /   |
2912    /    v
2913   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2914   |  /  |
2915   | /   v
2916   ||   [ ]     <-- vector pre header.
2917   |/    |
2918   |     v
2919   |    [  ] \
2920   |    [  ]_|   <-- vector loop.
2921   |     |
2922   |     v
2923   |   -[ ]   <--- middle-block.
2924   |  /  |
2925   | /   v
2926   -|- >[ ]     <--- new preheader.
2927    |    |
2928    |    v
2929    |   [ ] \
2930    |   [ ]_|   <-- old scalar loop to handle remainder.
2931     \   |
2932      \  v
2933       >[ ]     <-- exit block.
2934    ...
2935    */
2936 
2937   MDNode *OrigLoopID = OrigLoop->getLoopID();
2938 
2939   // Some loops have a single integer induction variable, while other loops
2940   // don't. One example is C++ iterators, which often have multiple pointer
2941   // induction variables. In the code below we also support a case where we
2942   // don't have a single induction variable.
2943   //
2944   // We try as hard as possible to obtain an induction variable from the
2945   // original loop. However, if we don't find one that:
2946   //   - is an integer
2947   //   - counts from zero, stepping by one
2948   //   - is the size of the widest induction variable type
2949   // then we create a new one.
2950   OldInduction = Legal->getPrimaryInduction();
2951   Type *IdxTy = Legal->getWidestInductionType();
2952 
2953   // Split the single block loop into the two loop structure described above.
2954   LoopScalarBody = OrigLoop->getHeader();
2955   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2956   LoopExitBlock = OrigLoop->getExitBlock();
2957   assert(LoopExitBlock && "Must have an exit block");
2958   assert(LoopVectorPreHeader && "Invalid loop structure");
2959 
2960   LoopMiddleBlock =
2961       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2962                  LI, nullptr, "middle.block");
2963   LoopScalarPreHeader =
2964       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2965                  nullptr, "scalar.ph");
2966   // We intentionally don't let SplitBlock update LoopInfo since
2967   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
2968   // LoopVectorBody is explicitly added to the correct place a few lines later.
2969   LoopVectorBody =
2970       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2971                  nullptr, nullptr, "vector.body");
2972 
2973   // Update dominator for loop exit.
2974   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2975 
2976   // Create and register the new vector loop.
2977   Loop *Lp = LI->AllocateLoop();
2978   Loop *ParentLoop = OrigLoop->getParentLoop();
2979 
2980   // Insert the new loop into the loop nest and register the new basic blocks
2981   // before calling any utilities such as SCEV that require valid LoopInfo.
2982   if (ParentLoop) {
2983     ParentLoop->addChildLoop(Lp);
2984   } else {
2985     LI->addTopLevelLoop(Lp);
2986   }
2987   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
2988 
2989   // Find the loop boundaries.
2990   Value *Count = getOrCreateTripCount(Lp);
2991 
2992   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2993 
2994   // Now, compare the new count to zero. If it is zero skip the vector loop and
2995   // jump to the scalar loop. This check also covers the case where the
2996   // backedge-taken count is uint##_max: adding one to it will overflow leading
2997   // to an incorrect trip count of zero. In this (rare) case we will also jump
2998   // to the scalar loop.
2999   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3000 
3001   // Generate the code to check any assumptions that we've made for SCEV
3002   // expressions.
3003   emitSCEVChecks(Lp, LoopScalarPreHeader);
3004 
3005   // Generate the code that checks in runtime if arrays overlap. We put the
3006   // checks into a separate block to make the more common case of few elements
3007   // faster.
3008   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3009 
3010   // Generate the induction variable.
3011   // The loop step is equal to the vectorization factor (num of SIMD elements)
3012   // times the unroll factor (num of SIMD instructions).
3013   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3014   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3015   Induction =
3016       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3017                               getDebugLocFromInstOrOperands(OldInduction));
3018 
3019   // We are going to resume the execution of the scalar loop.
3020   // Go over all of the induction variables that we found and fix the
3021   // PHIs that are left in the scalar version of the loop.
3022   // The starting values of PHI nodes depend on the counter of the last
3023   // iteration in the vectorized loop.
3024   // If we come from a bypass edge then we need to start from the original
3025   // start value.
3026 
3027   // This variable saves the new starting index for the scalar loop. It is used
3028   // to test if there are any tail iterations left once the vector loop has
3029   // completed.
3030   for (auto &InductionEntry : Legal->getInductionVars()) {
3031     PHINode *OrigPhi = InductionEntry.first;
3032     InductionDescriptor II = InductionEntry.second;
3033 
3034     // Create phi nodes to merge from the backedge-taken check block.
3035     PHINode *BCResumeVal =
3036         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3037                         LoopScalarPreHeader->getTerminator());
3038     // Copy original phi DL over to the new one.
3039     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3040     Value *&EndValue = IVEndValues[OrigPhi];
3041     if (OrigPhi == OldInduction) {
3042       // We know what the end value is.
3043       EndValue = CountRoundDown;
3044     } else {
3045       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3046       Type *StepType = II.getStep()->getType();
3047       Instruction::CastOps CastOp =
3048           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3049       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3050       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3051       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3052       EndValue->setName("ind.end");
3053     }
3054 
3055     // The new PHI merges the original incoming value, in case of a bypass,
3056     // or the value at the end of the vectorized loop.
3057     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3058 
3059     // Fix the scalar body counter (PHI node).
3060     // The old induction's phi node in the scalar body needs the truncated
3061     // value.
3062     for (BasicBlock *BB : LoopBypassBlocks)
3063       BCResumeVal->addIncoming(II.getStartValue(), BB);
3064     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3065   }
3066 
3067   // We need the OrigLoop (scalar loop part) latch terminator to help
3068   // produce correct debug info for the middle block BB instructions.
3069   // The legality check stage guarantees that the loop will have a single
3070   // latch.
3071   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3072          "Scalar loop latch terminator isn't a branch");
3073   BranchInst *ScalarLatchBr =
3074       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3075 
3076   // Add a check in the middle block to see if we have completed
3077   // all of the iterations in the first vector loop.
3078   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3079   // If tail is to be folded, we know we don't need to run the remainder.
3080   Value *CmpN = Builder.getTrue();
3081   if (!Cost->foldTailByMasking()) {
3082     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3083                            CountRoundDown, "cmp.n",
3084                            LoopMiddleBlock->getTerminator());
3085 
3086     // Here we use the same DebugLoc as the scalar loop latch branch instead
3087     // of the corresponding compare because they may have ended up with
3088     // different line numbers and we want to avoid awkward line stepping while
3089     // debugging. E.g., if the compare got a line number inside the loop.
3090     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3091   }
3092 
3093   BranchInst *BrInst =
3094       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3095   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3096   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3097 
3098   // Get ready to start creating new instructions into the vectorized body.
3099   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3100          "Inconsistent vector loop preheader");
3101   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3102 
3103   Optional<MDNode *> VectorizedLoopID =
3104       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3105                                       LLVMLoopVectorizeFollowupVectorized});
3106   if (VectorizedLoopID.hasValue()) {
3107     Lp->setLoopID(VectorizedLoopID.getValue());
3108 
3109     // Do not setAlreadyVectorized if loop attributes have been defined
3110     // explicitly.
3111     return LoopVectorPreHeader;
3112   }
3113 
3114   // Keep all loop hints from the original loop on the vector loop (we'll
3115   // replace the vectorizer-specific hints below).
3116   if (MDNode *LID = OrigLoop->getLoopID())
3117     Lp->setLoopID(LID);
3118 
3119   LoopVectorizeHints Hints(Lp, true, *ORE);
3120   Hints.setAlreadyVectorized();
3121 
3122 #ifdef EXPENSIVE_CHECKS
3123   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3124   LI->verify(*DT);
3125 #endif
3126 
3127   return LoopVectorPreHeader;
3128 }
3129 
3130 // Fix up external users of the induction variable. At this point, we are
3131 // in LCSSA form, with all external PHIs that use the IV having one input value,
3132 // coming from the remainder loop. We need those PHIs to also have a correct
3133 // value for the IV when arriving directly from the middle block.
3134 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3135                                        const InductionDescriptor &II,
3136                                        Value *CountRoundDown, Value *EndValue,
3137                                        BasicBlock *MiddleBlock) {
3138   // There are two kinds of external IV usages - those that use the value
3139   // computed in the last iteration (the PHI) and those that use the penultimate
3140   // value (the value that feeds into the phi from the loop latch).
3141   // We allow both, but they, obviously, have different values.
3142 
3143   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3144 
3145   DenseMap<Value *, Value *> MissingVals;
3146 
3147   // An external user of the last iteration's value should see the value that
3148   // the remainder loop uses to initialize its own IV.
3149   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3150   for (User *U : PostInc->users()) {
3151     Instruction *UI = cast<Instruction>(U);
3152     if (!OrigLoop->contains(UI)) {
3153       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3154       MissingVals[UI] = EndValue;
3155     }
3156   }
3157 
3158   // An external user of the penultimate value needs to see EndValue - Step.
3159   // The simplest way to get this is to recompute it from the constituent SCEVs,
3160   // that is Start + (Step * (CRD - 1)).
3161   for (User *U : OrigPhi->users()) {
3162     auto *UI = cast<Instruction>(U);
3163     if (!OrigLoop->contains(UI)) {
3164       const DataLayout &DL =
3165           OrigLoop->getHeader()->getModule()->getDataLayout();
3166       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3167 
3168       IRBuilder<> B(MiddleBlock->getTerminator());
3169       Value *CountMinusOne = B.CreateSub(
3170           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3171       Value *CMO =
3172           !II.getStep()->getType()->isIntegerTy()
3173               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3174                              II.getStep()->getType())
3175               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3176       CMO->setName("cast.cmo");
3177       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3178       Escape->setName("ind.escape");
3179       MissingVals[UI] = Escape;
3180     }
3181   }
3182 
3183   for (auto &I : MissingVals) {
3184     PHINode *PHI = cast<PHINode>(I.first);
3185     // One corner case we have to handle is two IVs "chasing" each other,
3186     // that is %IV2 = phi [...], [ %IV1, %latch ]
3187     // In this case, if IV1 has an external use, we need to avoid adding both
3188     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3189     // don't already have an incoming value for the middle block.
3190     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3191       PHI->addIncoming(I.second, MiddleBlock);
3192   }
3193 }
3194 
3195 namespace {
3196 
3197 struct CSEDenseMapInfo {
3198   static bool canHandle(const Instruction *I) {
3199     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3200            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3201   }
3202 
3203   static inline Instruction *getEmptyKey() {
3204     return DenseMapInfo<Instruction *>::getEmptyKey();
3205   }
3206 
3207   static inline Instruction *getTombstoneKey() {
3208     return DenseMapInfo<Instruction *>::getTombstoneKey();
3209   }
3210 
3211   static unsigned getHashValue(const Instruction *I) {
3212     assert(canHandle(I) && "Unknown instruction!");
3213     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3214                                                            I->value_op_end()));
3215   }
3216 
3217   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3218     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3219         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3220       return LHS == RHS;
3221     return LHS->isIdenticalTo(RHS);
3222   }
3223 };
3224 
3225 } // end anonymous namespace
3226 
3227 /// Perform CSE of induction variable instructions.
3228 static void cse(BasicBlock *BB) {
3229   // Perform simple cse.
3230   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3231   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3232     Instruction *In = &*I++;
3233 
3234     if (!CSEDenseMapInfo::canHandle(In))
3235       continue;
3236 
3237     // Check if we can replace this instruction with any of the
3238     // visited instructions.
3239     if (Instruction *V = CSEMap.lookup(In)) {
3240       In->replaceAllUsesWith(V);
3241       In->eraseFromParent();
3242       continue;
3243     }
3244 
3245     CSEMap[In] = In;
3246   }
3247 }
3248 
3249 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3250                                                        unsigned VF,
3251                                                        bool &NeedToScalarize) {
3252   Function *F = CI->getCalledFunction();
3253   Type *ScalarRetTy = CI->getType();
3254   SmallVector<Type *, 4> Tys, ScalarTys;
3255   for (auto &ArgOp : CI->arg_operands())
3256     ScalarTys.push_back(ArgOp->getType());
3257 
3258   // Estimate cost of scalarized vector call. The source operands are assumed
3259   // to be vectors, so we need to extract individual elements from there,
3260   // execute VF scalar calls, and then gather the result into the vector return
3261   // value.
3262   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3263   if (VF == 1)
3264     return ScalarCallCost;
3265 
3266   // Compute corresponding vector type for return value and arguments.
3267   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3268   for (Type *ScalarTy : ScalarTys)
3269     Tys.push_back(ToVectorTy(ScalarTy, VF));
3270 
3271   // Compute costs of unpacking argument values for the scalar calls and
3272   // packing the return values to a vector.
3273   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3274 
3275   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
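       // E.g. (illustrative), with VF = 4, a scalar call cost of 10 and a
       // scalarization overhead of 12, the scalarized cost is 4 * 10 + 12 = 52; a
       // vector library variant cheaper than that is preferred below.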
3276 
3277   // If we can't emit a vector call for this function, then the currently found
3278   // cost is the cost we need to return.
3279   NeedToScalarize = true;
3280   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3281   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3282 
3283   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3284     return Cost;
3285 
3286   // If the corresponding vector cost is cheaper, return its cost.
3287   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3288   if (VectorCallCost < Cost) {
3289     NeedToScalarize = false;
3290     return VectorCallCost;
3291   }
3292   return Cost;
3293 }
3294 
3295 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3296                                                             unsigned VF) {
3297   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3298   assert(ID && "Expected intrinsic call!");
3299 
3300   FastMathFlags FMF;
3301   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3302     FMF = FPMO->getFastMathFlags();
3303 
3304   SmallVector<Value *, 4> Operands(CI->arg_operands());
3305   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI);
3306 }
3307 
3308 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3309   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3310   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3311   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3312 }
3313 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3314   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3315   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3316   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3317 }
3318 
3319 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3320   // For every instruction `I` in MinBWs, truncate the operands, create a
3321   // truncated version of `I` and reextend its result. InstCombine runs
3322   // later and will remove any ext/trunc pairs.
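       // For example (illustrative), if MinBWs records that an i32 add only needs
       // 8 bits, a widened "add <4 x i32>" becomes a trunc of each operand to
       // <4 x i8>, an "add <4 x i8>", and a zext of the result back to <4 x i32>.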
3323   SmallPtrSet<Value *, 4> Erased;
3324   for (const auto &KV : Cost->getMinimalBitwidths()) {
3325     // If the value wasn't vectorized, we must maintain the original scalar
3326     // type. The absence of the value from VectorLoopValueMap indicates that it
3327     // wasn't vectorized.
3328     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3329       continue;
3330     for (unsigned Part = 0; Part < UF; ++Part) {
3331       Value *I = getOrCreateVectorValue(KV.first, Part);
3332       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3333           !isa<Instruction>(I))
3334         continue;
3335       Type *OriginalTy = I->getType();
3336       Type *ScalarTruncatedTy =
3337           IntegerType::get(OriginalTy->getContext(), KV.second);
3338       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3339                                           OriginalTy->getVectorNumElements());
3340       if (TruncatedTy == OriginalTy)
3341         continue;
3342 
3343       IRBuilder<> B(cast<Instruction>(I));
3344       auto ShrinkOperand = [&](Value *V) -> Value * {
3345         if (auto *ZI = dyn_cast<ZExtInst>(V))
3346           if (ZI->getSrcTy() == TruncatedTy)
3347             return ZI->getOperand(0);
3348         return B.CreateZExtOrTrunc(V, TruncatedTy);
3349       };
3350 
3351       // The actual instruction modification depends on the instruction type,
3352       // unfortunately.
3353       Value *NewI = nullptr;
3354       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3355         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3356                              ShrinkOperand(BO->getOperand(1)));
3357 
3358         // Any wrapping introduced by shrinking this operation shouldn't be
3359         // considered undefined behavior. So, we can't unconditionally copy
3360         // arithmetic wrapping flags to NewI.
3361         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3362       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3363         NewI =
3364             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3365                          ShrinkOperand(CI->getOperand(1)));
3366       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3367         NewI = B.CreateSelect(SI->getCondition(),
3368                               ShrinkOperand(SI->getTrueValue()),
3369                               ShrinkOperand(SI->getFalseValue()));
3370       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3371         switch (CI->getOpcode()) {
3372         default:
3373           llvm_unreachable("Unhandled cast!");
3374         case Instruction::Trunc:
3375           NewI = ShrinkOperand(CI->getOperand(0));
3376           break;
3377         case Instruction::SExt:
3378           NewI = B.CreateSExtOrTrunc(
3379               CI->getOperand(0),
3380               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3381           break;
3382         case Instruction::ZExt:
3383           NewI = B.CreateZExtOrTrunc(
3384               CI->getOperand(0),
3385               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3386           break;
3387         }
3388       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3389         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3390         auto *O0 = B.CreateZExtOrTrunc(
3391             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3392         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3393         auto *O1 = B.CreateZExtOrTrunc(
3394             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3395 
3396         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3397       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3398         // Don't do anything with the operands, just extend the result.
3399         continue;
3400       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3401         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3402         auto *O0 = B.CreateZExtOrTrunc(
3403             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3404         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3405         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3406       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3407         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3408         auto *O0 = B.CreateZExtOrTrunc(
3409             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3410         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3411       } else {
3412         // If we don't know what to do, be conservative and don't do anything.
3413         continue;
3414       }
3415 
3416       // Lastly, extend the result.
3417       NewI->takeName(cast<Instruction>(I));
3418       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3419       I->replaceAllUsesWith(Res);
3420       cast<Instruction>(I)->eraseFromParent();
3421       Erased.insert(I);
3422       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3423     }
3424   }
3425 
3426   // We'll have created a number of ZExts that are now dead. Clean them up.
3427   for (const auto &KV : Cost->getMinimalBitwidths()) {
3428     // If the value wasn't vectorized, we must maintain the original scalar
3429     // type. The absence of the value from VectorLoopValueMap indicates that it
3430     // wasn't vectorized.
3431     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3432       continue;
3433     for (unsigned Part = 0; Part < UF; ++Part) {
3434       Value *I = getOrCreateVectorValue(KV.first, Part);
3435       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3436       if (Inst && Inst->use_empty()) {
3437         Value *NewI = Inst->getOperand(0);
3438         Inst->eraseFromParent();
3439         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3440       }
3441     }
3442   }
3443 }
3444 
3445 void InnerLoopVectorizer::fixVectorizedLoop() {
3446   // Insert truncates and extends for any truncated instructions as hints to
3447   // InstCombine.
3448   if (VF > 1)
3449     truncateToMinimalBitwidths();
3450 
3451   // Fix widened non-induction PHIs by setting up the PHI operands.
3452   if (OrigPHIsToFix.size()) {
3453     assert(EnableVPlanNativePath &&
3454            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3455     fixNonInductionPHIs();
3456   }
3457 
3458   // At this point every instruction in the original loop is widened to a
3459   // vector form. Now we need to fix the recurrences in the loop. These PHI
3460   // nodes are currently empty because we did not want to introduce cycles.
3461   // This is the second stage of vectorizing recurrences.
3462   fixCrossIterationPHIs();
3463 
3464   // Forget the original basic block.
3465   PSE.getSE()->forgetLoop(OrigLoop);
3466 
3467   // Fix-up external users of the induction variables.
3468   for (auto &Entry : Legal->getInductionVars())
3469     fixupIVUsers(Entry.first, Entry.second,
3470                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3471                  IVEndValues[Entry.first], LoopMiddleBlock);
3472 
3473   fixLCSSAPHIs();
3474   for (Instruction *PI : PredicatedInstructions)
3475     sinkScalarOperands(&*PI);
3476 
3477   // Remove redundant induction instructions.
3478   cse(LoopVectorBody);
3479 
3480   // Set/update profile weights for the vector and remainder loops as original
3481   // loop iterations are now distributed among them. Note that original loop
3482   // represented by LoopScalarBody becomes remainder loop after vectorization.
3483   //
3484   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3485   // end up with a slightly less precise result, but that should be OK since
3486   // the profile is not inherently precise anyway. Note also that a possible
3487   // bypass of the vector code caused by the legality checks is ignored,
3488   // optimistically assigning all the weight to the vector loop.
3489   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3490                                LI->getLoopFor(LoopVectorBody),
3491                                LI->getLoopFor(LoopScalarBody), VF * UF);
3492 }
3493 
3494 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3495   // In order to support recurrences we need to be able to vectorize Phi nodes.
3496   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3497   // stage #2: We now need to fix the recurrences by adding incoming edges to
3498   // the currently empty PHI nodes. At this point every instruction in the
3499   // original loop is widened to a vector form so we can use them to construct
3500   // the incoming edges.
3501   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3502     // Handle first-order recurrences and reductions that need to be fixed.
3503     if (Legal->isFirstOrderRecurrence(&Phi))
3504       fixFirstOrderRecurrence(&Phi);
3505     else if (Legal->isReductionVariable(&Phi))
3506       fixReduction(&Phi);
3507   }
3508 }
3509 
3510 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3511   // This is the second phase of vectorizing first-order recurrences. An
3512   // overview of the transformation is described below. Suppose we have the
3513   // following loop.
3514   //
3515   //   for (int i = 0; i < n; ++i)
3516   //     b[i] = a[i] - a[i - 1];
3517   //
3518   // There is a first-order recurrence on "a". For this loop, the shorthand
3519   // scalar IR looks like:
3520   //
3521   //   scalar.ph:
3522   //     s_init = a[-1]
3523   //     br scalar.body
3524   //
3525   //   scalar.body:
3526   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3527   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3528   //     s2 = a[i]
3529   //     b[i] = s2 - s1
3530   //     br cond, scalar.body, ...
3531   //
3532   // In this example, s1 is a recurrence because its value depends on the
3533   // previous iteration. In the first phase of vectorization, we created a
3534   // temporary value for s1. We now complete the vectorization and produce the
3535   // shorthand vector IR shown below (for VF = 4, UF = 1).
3536   //
3537   //   vector.ph:
3538   //     v_init = vector(..., ..., ..., a[-1])
3539   //     br vector.body
3540   //
3541   //   vector.body
3542   //     i = phi [0, vector.ph], [i+4, vector.body]
3543   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3544   //     v2 = a[i, i+1, i+2, i+3];
3545   //     v3 = vector(v1(3), v2(0, 1, 2))
3546   //     b[i, i+1, i+2, i+3] = v2 - v3
3547   //     br cond, vector.body, middle.block
3548   //
3549   //   middle.block:
3550   //     x = v2(3)
3551   //     br scalar.ph
3552   //
3553   //   scalar.ph:
3554   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3555   //     br scalar.body
3556   //
3557   // After the vector loop completes execution, we extract the next value of
3558   // the recurrence (x) to use as the initial value in the scalar loop.
3559 
3560   // Get the original loop preheader and single loop latch.
3561   auto *Preheader = OrigLoop->getLoopPreheader();
3562   auto *Latch = OrigLoop->getLoopLatch();
3563 
3564   // Get the initial and previous values of the scalar recurrence.
3565   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3566   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3567 
3568   // Create a vector from the initial value.
3569   auto *VectorInit = ScalarInit;
3570   if (VF > 1) {
3571     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3572     VectorInit = Builder.CreateInsertElement(
3573         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3574         Builder.getInt32(VF - 1), "vector.recur.init");
3575   }
3576 
3577   // We constructed a temporary phi node in the first phase of vectorization.
3578   // This phi node will eventually be deleted.
3579   Builder.SetInsertPoint(
3580       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3581 
3582   // Create a phi node for the new recurrence. The current value will either be
3583   // the initial value inserted into a vector or loop-varying vector value.
3584   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3585   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3586 
3587   // Get the vectorized previous value of the last part UF - 1. It appears last
3588   // among all unrolled iterations, due to the order of their construction.
3589   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3590 
3591   // Find and set the insertion point after the previous value if it is an
3592   // instruction.
3593   BasicBlock::iterator InsertPt;
3594   // Note that the previous value may have been constant-folded so it is not
3595   // guaranteed to be an instruction in the vector loop.
3596   // FIXME: Loop invariant values do not form recurrences. We should deal with
3597   //        them earlier.
3598   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3599     InsertPt = LoopVectorBody->getFirstInsertionPt();
3600   else {
3601     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3602     if (isa<PHINode>(PreviousLastPart))
3603       // If the previous value is a phi node, we should insert after all the phi
3604       // nodes in the block containing the PHI to avoid breaking basic block
3605       // verification. Note that the basic block may be different from
3606       // LoopVectorBody, in case we predicate the loop.
3607       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3608     else
3609       InsertPt = ++PreviousInst->getIterator();
3610   }
3611   Builder.SetInsertPoint(&*InsertPt);
3612 
3613   // We will construct a vector for the recurrence by combining the values for
3614   // the current and previous iterations. This is the required shuffle mask.
3615   SmallVector<Constant *, 8> ShuffleMask(VF);
3616   ShuffleMask[0] = Builder.getInt32(VF - 1);
3617   for (unsigned I = 1; I < VF; ++I)
3618     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
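       // E.g. (illustrative), for VF = 4 the mask is <3, 4, 5, 6>: the last element
       // of the incoming vector followed by the first three elements of the
       // previous-part vector, matching v3 = vector(v1(3), v2(0, 1, 2)) above.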
3619 
3620   // The vector from which to take the initial value for the current iteration
3621   // (actual or unrolled). Initially, this is the vector phi node.
3622   Value *Incoming = VecPhi;
3623 
3624   // Shuffle the current and previous vector and update the vector parts.
3625   for (unsigned Part = 0; Part < UF; ++Part) {
3626     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3627     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3628     auto *Shuffle =
3629         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3630                                              ConstantVector::get(ShuffleMask))
3631                : Incoming;
3632     PhiPart->replaceAllUsesWith(Shuffle);
3633     cast<Instruction>(PhiPart)->eraseFromParent();
3634     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3635     Incoming = PreviousPart;
3636   }
3637 
3638   // Fix the latch value of the new recurrence in the vector loop.
3639   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3640 
3641   // Extract the last vector element in the middle block. This will be the
3642   // initial value for the recurrence when jumping to the scalar loop.
3643   auto *ExtractForScalar = Incoming;
3644   if (VF > 1) {
3645     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3646     ExtractForScalar = Builder.CreateExtractElement(
3647         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3648   }
3649   // Extract the second last element in the middle block if the
3650   // Phi is used outside the loop. We need to extract the phi itself
3651   // and not the last element (the phi update in the current iteration). This
3652   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3653   // when the scalar loop is not run at all.
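  // For example, with VF = 4 this extracts lane 2: the value the phi itself
  // held during the last iteration executed by the vector loop.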
3654   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3655   if (VF > 1)
3656     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3657         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second last element when VF > 1.
3662   else if (UF > 1)
3663     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3664 
3665   // Fix the initial value of the original recurrence in the scalar loop.
3666   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3667   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3668   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3669     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3670     Start->addIncoming(Incoming, BB);
3671   }
3672 
3673   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3674   Phi->setName("scalar.recur");
3675 
3676   // Finally, fix users of the recurrence outside the loop. The users will need
3677   // either the last value of the scalar recurrence or the last value of the
3678   // vector recurrence we extracted in the middle block. Since the loop is in
3679   // LCSSA form, we just need to find all the phi nodes for the original scalar
3680   // recurrence in the exit block, and then add an edge for the middle block.
3681   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3682     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3683       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3684     }
3685   }
3686 }
3687 
3688 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3689   Constant *Zero = Builder.getInt32(0);
3690 
  // Get its reduction variable descriptor.
3692   assert(Legal->isReductionVariable(Phi) &&
3693          "Unable to find the reduction variable");
3694   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3695 
3696   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3697   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3698   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3699   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3700     RdxDesc.getMinMaxRecurrenceKind();
3701   setDebugLocFromInst(Builder, ReductionStartValue);
3702 
3703   // We need to generate a reduction vector from the incoming scalar.
3704   // To do so, we need to generate the 'identity' vector and override
3705   // one of the elements with the incoming scalar reduction. We need
3706   // to do it in the vector-loop preheader.
3707   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3708 
3709   // This is the vector-clone of the value that leaves the loop.
3710   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3711 
  // Find the reduction identity value: zero for addition, or, and xor; one
  // for multiplication; -1 (all ones) for and.
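  // For example, an integer add reduction with VF = 4 uses the identity
  // <0, 0, 0, 0> and the vector start value <StartValue, 0, 0, 0>.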
3714   Value *Identity;
3715   Value *VectorStart;
3716   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3717       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3719     if (VF == 1) {
3720       VectorStart = Identity = ReductionStartValue;
3721     } else {
3722       VectorStart = Identity =
3723         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3724     }
3725   } else {
3726     // Handle other reduction kinds:
3727     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3728         RK, VecTy->getScalarType());
3729     if (VF == 1) {
3730       Identity = Iden;
3731       // This vector is the Identity vector where the first element is the
3732       // incoming scalar reduction.
3733       VectorStart = ReductionStartValue;
3734     } else {
3735       Identity = ConstantVector::getSplat({VF, false}, Iden);
3736 
3737       // This vector is the Identity vector where the first element is the
3738       // incoming scalar reduction.
3739       VectorStart =
3740         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3741     }
3742   }
3743 
3744   // Wrap flags are in general invalid after vectorization, clear them.
3745   clearReductionWrapFlags(RdxDesc);
3746 
3747   // Fix the vector-loop phi.
3748 
3749   // Reductions do not have to start at zero. They can start with
3750   // any loop invariant values.
3751   BasicBlock *Latch = OrigLoop->getLoopLatch();
3752   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3753 
3754   for (unsigned Part = 0; Part < UF; ++Part) {
3755     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3756     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3757     // Make sure to add the reduction start value only to the
3758     // first unroll part.
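    // The remaining parts start at the identity, so the start value is
    // incorporated exactly once when the parts are combined in the middle
    // block.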
3759     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3760     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3761     cast<PHINode>(VecRdxPhi)
3762       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3763   }
3764 
3765   // Before each round, move the insertion point right between
3766   // the PHIs and the values we are going to write.
3767   // This allows us to write both PHINodes and the extractelement
3768   // instructions.
3769   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3770 
3771   setDebugLocFromInst(Builder, LoopExitInst);
3772 
  // If the tail is folded by masking, the vector value leaving the loop should
  // be the Select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, rather than the LoopExitInst alone.
3776   if (Cost->foldTailByMasking()) {
3777     for (unsigned Part = 0; Part < UF; ++Part) {
3778       Value *VecLoopExitInst =
3779           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3780       Value *Sel = nullptr;
3781       for (User *U : VecLoopExitInst->users()) {
3782         if (isa<SelectInst>(U)) {
3783           assert(!Sel && "Reduction exit feeding two selects");
3784           Sel = U;
3785         } else
3786           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3787       }
3788       assert(Sel && "Reduction exit feeds no select");
3789       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3790     }
3791   }
3792 
3793   // If the vector reduction can be performed in a smaller type, we truncate
3794   // then extend the loop exit value to enable InstCombine to evaluate the
3795   // entire expression in the smaller type.
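  // For example, a sum of i8 values accumulated through an i32 phi can be
  // evaluated as <VF x i8> operations; the scalar result is extended back to
  // i32 after the final reduction below.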
3796   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3797     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3798     Builder.SetInsertPoint(
3799         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3800     VectorParts RdxParts(UF);
3801     for (unsigned Part = 0; Part < UF; ++Part) {
3802       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3803       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3804       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3805                                         : Builder.CreateZExt(Trunc, VecTy);
3806       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3807            UI != RdxParts[Part]->user_end();)
3808         if (*UI != Trunc) {
3809           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3810           RdxParts[Part] = Extnd;
3811         } else {
3812           ++UI;
3813         }
3814     }
3815     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3816     for (unsigned Part = 0; Part < UF; ++Part) {
3817       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3818       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3819     }
3820   }
3821 
3822   // Reduce all of the unrolled parts into a single vector.
3823   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3824   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3825 
3826   // The middle block terminator has already been assigned a DebugLoc here (the
3827   // OrigLoop's single latch terminator). We want the whole middle block to
3828   // appear to execute on this line because: (a) it is all compiler generated,
3829   // (b) these instructions are always executed after evaluating the latch
3830   // conditional branch, and (c) other passes may add new predecessors which
3831   // terminate on this line. This is the easiest way to ensure we don't
3832   // accidentally cause an extra step back into the loop while debugging.
3833   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3834   for (unsigned Part = 1; Part < UF; ++Part) {
3835     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3836     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3837       // Floating point operations had to be 'fast' to enable the reduction.
3838       ReducedPartRdx = addFastMathFlag(
3839           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3840                               ReducedPartRdx, "bin.rdx"),
3841           RdxDesc.getFastMathFlags());
3842     else
3843       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3844                                       RdxPart);
3845   }
3846 
3847   if (VF > 1) {
3848     bool NoNaN = Legal->hasFunNoNaNAttr();
3849     ReducedPartRdx =
3850         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3851     // If the reduction can be performed in a smaller type, we need to extend
3852     // the reduction to the wider type before we branch to the original loop.
3853     if (Phi->getType() != RdxDesc.getRecurrenceType())
3854       ReducedPartRdx =
3855         RdxDesc.isSigned()
3856         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3857         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3858   }
3859 
3860   // Create a phi node that merges control-flow from the backedge-taken check
3861   // block and the middle block.
3862   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3863                                         LoopScalarPreHeader->getTerminator());
3864   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3865     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3866   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3867 
3868   // Now, we need to fix the users of the reduction variable
3869   // inside and outside of the scalar remainder loop.
3870   // We know that the loop is in LCSSA form. We need to update the
3871   // PHI nodes in the exit blocks.
3872   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3873     // All PHINodes need to have a single entry edge, or two if
3874     // we already fixed them.
3875     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3876 
3877     // We found a reduction value exit-PHI. Update it with the
3878     // incoming bypass edge.
3879     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3880       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3881   } // end of the LCSSA phi scan.
3882 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3885   int IncomingEdgeBlockIdx =
3886     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3887   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3888   // Pick the other block.
3889   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3890   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3891   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3892 }
3893 
3894 void InnerLoopVectorizer::clearReductionWrapFlags(
3895     RecurrenceDescriptor &RdxDesc) {
3896   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3897   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3898       RK != RecurrenceDescriptor::RK_IntegerMult)
3899     return;
3900 
3901   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3902   assert(LoopExitInstr && "null loop exit instruction");
3903   SmallVector<Instruction *, 8> Worklist;
3904   SmallPtrSet<Instruction *, 8> Visited;
3905   Worklist.push_back(LoopExitInstr);
3906   Visited.insert(LoopExitInstr);
3907 
3908   while (!Worklist.empty()) {
3909     Instruction *Cur = Worklist.pop_back_val();
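    // Drop the poison-generating wrap flags (nuw/nsw) from the vectorized
    // copies of this overflowing operation; flags proven for the scalar loop
    // need not hold for the widened operations.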
3910     if (isa<OverflowingBinaryOperator>(Cur))
3911       for (unsigned Part = 0; Part < UF; ++Part) {
3912         Value *V = getOrCreateVectorValue(Cur, Part);
3913         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3914       }
3915 
3916     for (User *U : Cur->users()) {
3917       Instruction *UI = cast<Instruction>(U);
3918       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3919           Visited.insert(UI).second)
3920         Worklist.push_back(UI);
3921     }
3922   }
3923 }
3924 
3925 void InnerLoopVectorizer::fixLCSSAPHIs() {
3926   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3927     if (LCSSAPhi.getNumIncomingValues() == 1) {
3928       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3929       // Non-instruction incoming values will have only one value.
3930       unsigned LastLane = 0;
3931       if (isa<Instruction>(IncomingValue))
3932           LastLane = Cost->isUniformAfterVectorization(
3933                          cast<Instruction>(IncomingValue), VF)
3934                          ? 0
3935                          : VF - 1;
3936       // Can be a loop invariant incoming value or the last scalar value to be
3937       // extracted from the vectorized loop.
3938       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3939       Value *lastIncomingValue =
3940           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3941       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3942     }
3943   }
3944 }
3945 
3946 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3947   // The basic block and loop containing the predicated instruction.
3948   auto *PredBB = PredInst->getParent();
3949   auto *VectorLoop = LI->getLoopFor(PredBB);
3950 
3951   // Initialize a worklist with the operands of the predicated instruction.
3952   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3953 
3954   // Holds instructions that we need to analyze again. An instruction may be
3955   // reanalyzed if we don't yet know if we can sink it or not.
3956   SmallVector<Instruction *, 8> InstsToReanalyze;
3957 
3958   // Returns true if a given use occurs in the predicated block. Phi nodes use
3959   // their operands in their corresponding predecessor blocks.
3960   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3961     auto *I = cast<Instruction>(U.getUser());
3962     BasicBlock *BB = I->getParent();
3963     if (auto *Phi = dyn_cast<PHINode>(I))
3964       BB = Phi->getIncomingBlock(
3965           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3966     return BB == PredBB;
3967   };
3968 
3969   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
3971   // operands are then added to the worklist. The algorithm ends after one pass
3972   // through the worklist doesn't sink a single instruction.
3973   bool Changed;
3974   do {
3975     // Add the instructions that need to be reanalyzed to the worklist, and
3976     // reset the changed indicator.
3977     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3978     InstsToReanalyze.clear();
3979     Changed = false;
3980 
3981     while (!Worklist.empty()) {
3982       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3983 
3984       // We can't sink an instruction if it is a phi node, is already in the
3985       // predicated block, is not in the loop, or may have side effects.
3986       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3987           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3988         continue;
3989 
3990       // It's legal to sink the instruction if all its uses occur in the
3991       // predicated block. Otherwise, there's nothing to do yet, and we may
3992       // need to reanalyze the instruction.
3993       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3994         InstsToReanalyze.push_back(I);
3995         continue;
3996       }
3997 
3998       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4000       I->moveBefore(&*PredBB->getFirstInsertionPt());
4001       Worklist.insert(I->op_begin(), I->op_end());
4002 
4003       // The sinking may have enabled other instructions to be sunk, so we will
4004       // need to iterate.
4005       Changed = true;
4006     }
4007   } while (Changed);
4008 }
4009 
4010 void InnerLoopVectorizer::fixNonInductionPHIs() {
4011   for (PHINode *OrigPhi : OrigPHIsToFix) {
4012     PHINode *NewPhi =
4013         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4014     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4015 
4016     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4017         predecessors(OrigPhi->getParent()));
4018     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4019         predecessors(NewPhi->getParent()));
4020     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4021            "Scalar and Vector BB should have the same number of predecessors");
4022 
4023     // The insertion point in Builder may be invalidated by the time we get
4024     // here. Force the Builder insertion point to something valid so that we do
4025     // not run into issues during insertion point restore in
4026     // getOrCreateVectorValue calls below.
4027     Builder.SetInsertPoint(NewPhi);
4028 
4029     // The predecessor order is preserved and we can rely on mapping between
4030     // scalar and vector block predecessors.
4031     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4032       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4033 
4034       // When looking up the new scalar/vector values to fix up, use incoming
4035       // values from original phi.
4036       Value *ScIncV =
4037           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4038 
      // The scalar incoming value may need a broadcast.
4040       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4041       NewPhi->addIncoming(NewIncV, NewPredBB);
4042     }
4043   }
4044 }
4045 
4046 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4047                                    unsigned VF, bool IsPtrLoopInvariant,
4048                                    SmallBitVector &IsIndexLoopInvariant) {
4049   // Construct a vector GEP by widening the operands of the scalar GEP as
4050   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4051   // results in a vector of pointers when at least one operand of the GEP
4052   // is vector-typed. Thus, to keep the representation compact, we only use
4053   // vector-typed operands for loop-varying values.
4054 
4055   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4056     // If we are vectorizing, but the GEP has only loop-invariant operands,
4057     // the GEP we build (by only using vector-typed operands for
4058     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4059     // produce a vector of pointers, we need to either arbitrarily pick an
4060     // operand to broadcast, or broadcast a clone of the original GEP.
4061     // Here, we broadcast a clone of the original.
4062     //
4063     // TODO: If at some point we decide to scalarize instructions having
4064     //       loop-invariant operands, this special case will no longer be
4065     //       required. We would add the scalarization decision to
4066     //       collectLoopScalars() and teach getVectorValue() to broadcast
4067     //       the lane-zero scalar value.
4068     auto *Clone = Builder.Insert(GEP->clone());
4069     for (unsigned Part = 0; Part < UF; ++Part) {
4070       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4071       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4072       addMetadata(EntryPart, GEP);
4073     }
4074   } else {
4075     // If the GEP has at least one loop-varying operand, we are sure to
4076     // produce a vector of pointers. But if we are only unrolling, we want
4077     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4078     // produce with the code below will be scalar (if VF == 1) or vector
4079     // (otherwise). Note that for the unroll-only case, we still maintain
4080     // values in the vector mapping with initVector, as we do for other
4081     // instructions.
4082     for (unsigned Part = 0; Part < UF; ++Part) {
4083       // The pointer operand of the new GEP. If it's loop-invariant, we
4084       // won't broadcast it.
4085       auto *Ptr = IsPtrLoopInvariant
4086                       ? GEP->getPointerOperand()
4087                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4088 
4089       // Collect all the indices for the new GEP. If any index is
4090       // loop-invariant, we won't broadcast it.
4091       SmallVector<Value *, 4> Indices;
4092       for (auto Index : enumerate(GEP->indices())) {
4093         Value *User = Index.value().get();
4094         if (IsIndexLoopInvariant[Index.index()])
4095           Indices.push_back(User);
4096         else
4097           Indices.push_back(getOrCreateVectorValue(User, Part));
4098       }
4099 
4100       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4101       // but it should be a vector, otherwise.
4102       auto *NewGEP =
4103           GEP->isInBounds()
4104               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4105                                           Indices)
4106               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4107       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4108              "NewGEP is not a pointer vector");
4109       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4110       addMetadata(NewGEP, GEP);
4111     }
4112   }
4113 }
4114 
4115 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4116                                               unsigned VF) {
4117   PHINode *P = cast<PHINode>(PN);
4118   if (EnableVPlanNativePath) {
4119     // Currently we enter here in the VPlan-native path for non-induction
4120     // PHIs where all control flow is uniform. We simply widen these PHIs.
4121     // Create a vector phi with no operands - the vector phi operands will be
4122     // set at the end of vector code generation.
4123     Type *VecTy =
4124         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4125     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4126     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4127     OrigPHIsToFix.push_back(P);
4128 
4129     return;
4130   }
4131 
4132   assert(PN->getParent() == OrigLoop->getHeader() &&
4133          "Non-header phis should have been handled elsewhere");
4134 
4135   // In order to support recurrences we need to be able to vectorize Phi nodes.
4136   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4137   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4138   // this value when we vectorize all of the instructions that use the PHI.
4139   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4140     for (unsigned Part = 0; Part < UF; ++Part) {
4141       // This is phase one of vectorizing PHIs.
4142       Type *VecTy =
4143           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4144       Value *EntryPart = PHINode::Create(
4145           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4146       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4147     }
4148     return;
4149   }
4150 
4151   setDebugLocFromInst(Builder, P);
4152 
4153   // This PHINode must be an induction variable.
4154   // Make sure that we know about it.
4155   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4156 
4157   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4158   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4159 
4160   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4161   // which can be found from the original scalar operations.
4162   switch (II.getKind()) {
4163   case InductionDescriptor::IK_NoInduction:
4164     llvm_unreachable("Unknown induction");
4165   case InductionDescriptor::IK_IntInduction:
4166   case InductionDescriptor::IK_FpInduction:
4167     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4168   case InductionDescriptor::IK_PtrInduction: {
4169     // Handle the pointer induction variable case.
4170     assert(P->getType()->isPointerTy() && "Unexpected type.");
4171     // This is the normalized GEP that starts counting at zero.
4172     Value *PtrInd = Induction;
4173     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4174     // Determine the number of scalars we need to generate for each unroll
4175     // iteration. If the instruction is uniform, we only need to generate the
4176     // first lane. Otherwise, we generate all VF values.
4177     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4178     // These are the scalar results. Notice that we don't generate vector GEPs
4179     // because scalar GEPs result in better code.
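    // For example, with UF = 2 and VF = 4, the non-uniform case generates
    // eight scalar pointers, one per lane of each part, using the normalized
    // indices PtrInd + Part * VF + Lane.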
4180     for (unsigned Part = 0; Part < UF; ++Part) {
4181       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4182         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4183         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4184         Value *SclrGep =
4185             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4186         SclrGep->setName("next.gep");
4187         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4188       }
4189     }
4190     return;
4191   }
4192   }
4193 }
4194 
4195 /// A helper function for checking whether an integer division-related
4196 /// instruction may divide by zero (in which case it must be predicated if
4197 /// executed conditionally in the scalar code).
4198 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4200 /// converted into multiplication, so we will still end up scalarizing
4201 /// the division, but can do so w/o predication.
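/// For example, a udiv by the constant 5 never needs predication, while a
/// udiv by a non-constant divisor does, since that divisor may be zero in a
/// lane whose scalar iteration would not have executed.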
4202 static bool mayDivideByZero(Instruction &I) {
4203   assert((I.getOpcode() == Instruction::UDiv ||
4204           I.getOpcode() == Instruction::SDiv ||
4205           I.getOpcode() == Instruction::URem ||
4206           I.getOpcode() == Instruction::SRem) &&
4207          "Unexpected instruction");
4208   Value *Divisor = I.getOperand(1);
4209   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4210   return !CInt || CInt->isZero();
4211 }
4212 
4213 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4214   switch (I.getOpcode()) {
4215   case Instruction::Br:
4216   case Instruction::PHI:
4217   case Instruction::GetElementPtr:
4218     llvm_unreachable("This instruction is handled by a different recipe.");
4219   case Instruction::UDiv:
4220   case Instruction::SDiv:
4221   case Instruction::SRem:
4222   case Instruction::URem:
4223   case Instruction::Add:
4224   case Instruction::FAdd:
4225   case Instruction::Sub:
4226   case Instruction::FSub:
4227   case Instruction::FNeg:
4228   case Instruction::Mul:
4229   case Instruction::FMul:
4230   case Instruction::FDiv:
4231   case Instruction::FRem:
4232   case Instruction::Shl:
4233   case Instruction::LShr:
4234   case Instruction::AShr:
4235   case Instruction::And:
4236   case Instruction::Or:
4237   case Instruction::Xor: {
4238     // Just widen unops and binops.
4239     setDebugLocFromInst(Builder, &I);
4240 
4241     for (unsigned Part = 0; Part < UF; ++Part) {
4242       SmallVector<Value *, 2> Ops;
4243       for (Value *Op : I.operands())
4244         Ops.push_back(getOrCreateVectorValue(Op, Part));
4245 
4246       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4247 
4248       if (auto *VecOp = dyn_cast<Instruction>(V))
4249         VecOp->copyIRFlags(&I);
4250 
4251       // Use this vector value for all users of the original instruction.
4252       VectorLoopValueMap.setVectorValue(&I, Part, V);
4253       addMetadata(V, &I);
4254     }
4255 
4256     break;
4257   }
4258   case Instruction::Select: {
4259     // Widen selects.
4260     // If the selector is loop invariant we can create a select
4261     // instruction with a scalar condition. Otherwise, use vector-select.
4262     auto *SE = PSE.getSE();
4263     bool InvariantCond =
4264         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4265     setDebugLocFromInst(Builder, &I);
4266 
    // The condition can be loop invariant but still defined inside the
4268     // loop. This means that we can't just use the original 'cond' value.
4269     // We have to take the 'vectorized' value and pick the first lane.
4270     // Instcombine will make this a no-op.
4271 
4272     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4273 
4274     for (unsigned Part = 0; Part < UF; ++Part) {
4275       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4276       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4277       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4278       Value *Sel =
4279           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4280       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4281       addMetadata(Sel, &I);
4282     }
4283 
4284     break;
4285   }
4286 
4287   case Instruction::ICmp:
4288   case Instruction::FCmp: {
4289     // Widen compares. Generate vector compares.
4290     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4291     auto *Cmp = cast<CmpInst>(&I);
4292     setDebugLocFromInst(Builder, Cmp);
4293     for (unsigned Part = 0; Part < UF; ++Part) {
4294       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4295       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4296       Value *C = nullptr;
4297       if (FCmp) {
4298         // Propagate fast math flags.
4299         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4300         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4301         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4302       } else {
4303         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4304       }
4305       VectorLoopValueMap.setVectorValue(&I, Part, C);
4306       addMetadata(C, &I);
4307     }
4308 
4309     break;
4310   }
4311 
4312   case Instruction::ZExt:
4313   case Instruction::SExt:
4314   case Instruction::FPToUI:
4315   case Instruction::FPToSI:
4316   case Instruction::FPExt:
4317   case Instruction::PtrToInt:
4318   case Instruction::IntToPtr:
4319   case Instruction::SIToFP:
4320   case Instruction::UIToFP:
4321   case Instruction::Trunc:
4322   case Instruction::FPTrunc:
4323   case Instruction::BitCast: {
4324     auto *CI = cast<CastInst>(&I);
4325     setDebugLocFromInst(Builder, CI);
4326 
4327     /// Vectorize casts.
4328     Type *DestTy =
4329         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4330 
4331     for (unsigned Part = 0; Part < UF; ++Part) {
4332       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4333       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4334       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4335       addMetadata(Cast, &I);
4336     }
4337     break;
4338   }
4339 
4340   case Instruction::Call: {
4341     // Ignore dbg intrinsics.
4342     if (isa<DbgInfoIntrinsic>(I))
4343       break;
4344     setDebugLocFromInst(Builder, &I);
4345 
4346     Module *M = I.getParent()->getParent()->getParent();
4347     auto *CI = cast<CallInst>(&I);
4348 
4349     SmallVector<Type *, 4> Tys;
4350     for (Value *ArgOperand : CI->arg_operands())
4351       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4352 
4353     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4354 
    // Decide whether to use a vector intrinsic or an ordinary vector library
    // call for the widened instruction: prefer the intrinsic when it is at
    // least as cheap as the library call.
4358     bool NeedToScalarize = false;
4359     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4360     bool UseVectorIntrinsic =
4361         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4362     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4363            "Instruction should be scalarized elsewhere.");
4364 
4365     for (unsigned Part = 0; Part < UF; ++Part) {
4366       SmallVector<Value *, 4> Args;
4367       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4368         Value *Arg = CI->getArgOperand(i);
4369         // Some intrinsics have a scalar argument - don't replace it with a
4370         // vector.
4371         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4372           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4373         Args.push_back(Arg);
4374       }
4375 
4376       Function *VectorF;
4377       if (UseVectorIntrinsic) {
4378         // Use vector version of the intrinsic.
4379         Type *TysForDecl[] = {CI->getType()};
4380         if (VF > 1)
4381           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4382         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4383       } else {
4384         // Use vector version of the function call.
4385         const VFShape Shape =
4386             VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4387 #ifndef NDEBUG
4388         const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
4389         assert(std::find_if(Infos.begin(), Infos.end(),
4390                             [&Shape](const VFInfo &Info) {
4391                               return Info.Shape == Shape;
4392                             }) != Infos.end() &&
4393                "Vector function shape is missing from the database.");
4394 #endif
4395         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4396       }
4397       assert(VectorF && "Can't create vector function.");
4398 
4399       SmallVector<OperandBundleDef, 1> OpBundles;
4400       CI->getOperandBundlesAsDefs(OpBundles);
4401       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4402 
4403       if (isa<FPMathOperator>(V))
4404         V->copyFastMathFlags(CI);
4405 
4406       VectorLoopValueMap.setVectorValue(&I, Part, V);
4407       addMetadata(V, &I);
4408     }
4409 
4410     break;
4411   }
4412 
4413   default:
4414     // This instruction is not vectorized by simple widening.
4415     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4416     llvm_unreachable("Unhandled instruction!");
4417   } // end of switch.
4418 }
4419 
4420 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4421   // We should not collect Scalars more than once per VF. Right now, this
4422   // function is called from collectUniformsAndScalars(), which already does
4423   // this check. Collecting Scalars for VF=1 does not make any sense.
4424   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4425          "This function should not be visited twice for the same VF");
4426 
4427   SmallSetVector<Instruction *, 8> Worklist;
4428 
4429   // These sets are used to seed the analysis with pointers used by memory
4430   // accesses that will remain scalar.
4431   SmallSetVector<Instruction *, 8> ScalarPtrs;
4432   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4433 
4434   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4435   // The pointer operands of loads and stores will be scalar as long as the
4436   // memory access is not a gather or scatter operation. The value operand of a
4437   // store will remain scalar if the store is scalarized.
4438   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4439     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4440     assert(WideningDecision != CM_Unknown &&
4441            "Widening decision should be ready at this moment");
4442     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4443       if (Ptr == Store->getValueOperand())
4444         return WideningDecision == CM_Scalarize;
4445     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4446            "Ptr is neither a value or pointer operand");
4447     return WideningDecision != CM_GatherScatter;
4448   };
4449 
4450   // A helper that returns true if the given value is a bitcast or
4451   // getelementptr instruction contained in the loop.
4452   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4453     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4454             isa<GetElementPtrInst>(V)) &&
4455            !TheLoop->isLoopInvariant(V);
4456   };
4457 
4458   // A helper that evaluates a memory access's use of a pointer. If the use
4459   // will be a scalar use, and the pointer is only used by memory accesses, we
4460   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4461   // PossibleNonScalarPtrs.
4462   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4463     // We only care about bitcast and getelementptr instructions contained in
4464     // the loop.
4465     if (!isLoopVaryingBitCastOrGEP(Ptr))
4466       return;
4467 
4468     // If the pointer has already been identified as scalar (e.g., if it was
4469     // also identified as uniform), there's nothing to do.
4470     auto *I = cast<Instruction>(Ptr);
4471     if (Worklist.count(I))
4472       return;
4473 
4474     // If the use of the pointer will be a scalar use, and all users of the
4475     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4476     // place the pointer in PossibleNonScalarPtrs.
4477     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4478           return isa<LoadInst>(U) || isa<StoreInst>(U);
4479         }))
4480       ScalarPtrs.insert(I);
4481     else
4482       PossibleNonScalarPtrs.insert(I);
4483   };
4484 
4485   // We seed the scalars analysis with three classes of instructions: (1)
4486   // instructions marked uniform-after-vectorization, (2) bitcast and
4487   // getelementptr instructions used by memory accesses requiring a scalar use,
4488   // and (3) pointer induction variables and their update instructions (we
4489   // currently only scalarize these).
4490   //
4491   // (1) Add to the worklist all instructions that have been identified as
4492   // uniform-after-vectorization.
4493   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4494 
4495   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4496   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4498   // scatter operation. The value operand of a store will remain scalar if the
4499   // store is scalarized.
4500   for (auto *BB : TheLoop->blocks())
4501     for (auto &I : *BB) {
4502       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4503         evaluatePtrUse(Load, Load->getPointerOperand());
4504       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4505         evaluatePtrUse(Store, Store->getPointerOperand());
4506         evaluatePtrUse(Store, Store->getValueOperand());
4507       }
4508     }
4509   for (auto *I : ScalarPtrs)
4510     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4511       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4512       Worklist.insert(I);
4513     }
4514 
4515   // (3) Add to the worklist all pointer induction variables and their update
4516   // instructions.
4517   //
4518   // TODO: Once we are able to vectorize pointer induction variables we should
4519   //       no longer insert them into the worklist here.
4520   auto *Latch = TheLoop->getLoopLatch();
4521   for (auto &Induction : Legal->getInductionVars()) {
4522     auto *Ind = Induction.first;
4523     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4524     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4525       continue;
4526     Worklist.insert(Ind);
4527     Worklist.insert(IndUpdate);
4528     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4529     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4530                       << "\n");
4531   }
4532 
4533   // Insert the forced scalars.
4534   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4535   // induction variable when the PHI user is scalarized.
4536   auto ForcedScalar = ForcedScalars.find(VF);
4537   if (ForcedScalar != ForcedScalars.end())
4538     for (auto *I : ForcedScalar->second)
4539       Worklist.insert(I);
4540 
4541   // Expand the worklist by looking through any bitcasts and getelementptr
4542   // instructions we've already identified as scalar. This is similar to the
4543   // expansion step in collectLoopUniforms(); however, here we're only
4544   // expanding to include additional bitcasts and getelementptr instructions.
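  // For example, if a getelementptr already known to be scalar has a bitcast
  // pointer operand whose only users are that GEP and other scalar memory
  // accesses, the bitcast is marked scalar as well.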
4545   unsigned Idx = 0;
4546   while (Idx != Worklist.size()) {
4547     Instruction *Dst = Worklist[Idx++];
4548     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4549       continue;
4550     auto *Src = cast<Instruction>(Dst->getOperand(0));
4551     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4552           auto *J = cast<Instruction>(U);
4553           return !TheLoop->contains(J) || Worklist.count(J) ||
4554                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4555                   isScalarUse(J, Src));
4556         })) {
4557       Worklist.insert(Src);
4558       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4559     }
4560   }
4561 
4562   // An induction variable will remain scalar if all users of the induction
4563   // variable and induction variable update remain scalar.
4564   for (auto &Induction : Legal->getInductionVars()) {
4565     auto *Ind = Induction.first;
4566     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4567 
4568     // We already considered pointer induction variables, so there's no reason
4569     // to look at their users again.
4570     //
4571     // TODO: Once we are able to vectorize pointer induction variables we
4572     //       should no longer skip over them here.
4573     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4574       continue;
4575 
4576     // Determine if all users of the induction variable are scalar after
4577     // vectorization.
4578     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4579       auto *I = cast<Instruction>(U);
4580       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4581     });
4582     if (!ScalarInd)
4583       continue;
4584 
4585     // Determine if all users of the induction variable update instruction are
4586     // scalar after vectorization.
4587     auto ScalarIndUpdate =
4588         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4589           auto *I = cast<Instruction>(U);
4590           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4591         });
4592     if (!ScalarIndUpdate)
4593       continue;
4594 
4595     // The induction variable and its update instruction will remain scalar.
4596     Worklist.insert(Ind);
4597     Worklist.insert(IndUpdate);
4598     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4599     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4600                       << "\n");
4601   }
4602 
4603   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4604 }
4605 
4606 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4607   if (!blockNeedsPredication(I->getParent()))
4608     return false;
4609   switch(I->getOpcode()) {
4610   default:
4611     break;
4612   case Instruction::Load:
4613   case Instruction::Store: {
4614     if (!Legal->isMaskRequired(I))
4615       return false;
4616     auto *Ptr = getLoadStorePointerOperand(I);
4617     auto *Ty = getMemInstValueType(I);
4618     // We have already decided how to vectorize this instruction, get that
4619     // result.
4620     if (VF > 1) {
4621       InstWidening WideningDecision = getWideningDecision(I, VF);
4622       assert(WideningDecision != CM_Unknown &&
4623              "Widening decision should be ready at this moment");
4624       return WideningDecision == CM_Scalarize;
4625     }
4626     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4627     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4628                                 isLegalMaskedGather(Ty, Alignment))
4629                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4630                                 isLegalMaskedScatter(Ty, Alignment));
4631   }
4632   case Instruction::UDiv:
4633   case Instruction::SDiv:
4634   case Instruction::SRem:
4635   case Instruction::URem:
4636     return mayDivideByZero(*I);
4637   }
4638   return false;
4639 }
4640 
4641 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4642                                                                unsigned VF) {
4643   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4644   assert(getWideningDecision(I, VF) == CM_Unknown &&
4645          "Decision should not be set yet.");
4646   auto *Group = getInterleavedAccessGroup(I);
4647   assert(Group && "Must have a group.");
4648 
  // If the instruction's allocated size doesn't equal its type size, it
4650   // requires padding and will be scalarized.
4651   auto &DL = I->getModule()->getDataLayout();
4652   auto *ScalarTy = getMemInstValueType(I);
4653   if (hasIrregularType(ScalarTy, DL, VF))
4654     return false;
4655 
4656   // Check if masking is required.
4657   // A Group may need masking for one of two reasons: it resides in a block that
4658   // needs predication, or it was decided to use masking to deal with gaps.
4659   bool PredicatedAccessRequiresMasking =
4660       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4661   bool AccessWithGapsRequiresMasking =
4662       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4663   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4664     return true;
4665 
4666   // If masked interleaving is required, we expect that the user/target had
4667   // enabled it, because otherwise it either wouldn't have been created or
4668   // it should have been invalidated by the CostModel.
4669   assert(useMaskedInterleavedAccesses(TTI) &&
4670          "Masked interleave-groups for predicated accesses are not enabled.");
4671 
4672   auto *Ty = getMemInstValueType(I);
4673   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4674   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4675                           : TTI.isLegalMaskedStore(Ty, Alignment);
4676 }
4677 
4678 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4679                                                                unsigned VF) {
4680   // Get and ensure we have a valid memory instruction.
4681   LoadInst *LI = dyn_cast<LoadInst>(I);
4682   StoreInst *SI = dyn_cast<StoreInst>(I);
4683   assert((LI || SI) && "Invalid memory instruction");
4684 
4685   auto *Ptr = getLoadStorePointerOperand(I);
4686 
4687   // In order to be widened, the pointer should be consecutive, first of all.
4688   if (!Legal->isConsecutivePtr(Ptr))
4689     return false;
4690 
4691   // If the instruction is a store located in a predicated block, it will be
4692   // scalarized.
4693   if (isScalarWithPredication(I))
4694     return false;
4695 
  // If the instruction's allocated size doesn't equal its type size, it
4697   // requires padding and will be scalarized.
4698   auto &DL = I->getModule()->getDataLayout();
4699   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4700   if (hasIrregularType(ScalarTy, DL, VF))
4701     return false;
4702 
4703   return true;
4704 }
4705 
4706 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4707   // We should not collect Uniforms more than once per VF. Right now,
4708   // this function is called from collectUniformsAndScalars(), which
4709   // already does this check. Collecting Uniforms for VF=1 does not make any
4710   // sense.
4711 
4712   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4713          "This function should not be visited twice for the same VF");
4714 
  // Visit the list of Uniforms. If we don't find any uniform value, we won't
  // analyze it again: Uniforms.count(VF) will still return 1.
4717   Uniforms[VF].clear();
4718 
4719   // We now know that the loop is vectorizable!
4720   // Collect instructions inside the loop that will remain uniform after
4721   // vectorization.
4722 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
4725   auto isOutOfScope = [&](Value *V) -> bool {
4726     Instruction *I = dyn_cast<Instruction>(V);
4727     return (!I || !TheLoop->contains(I));
4728   };
4729 
4730   SetVector<Instruction *> Worklist;
4731   BasicBlock *Latch = TheLoop->getLoopLatch();
4732 
4733   // Instructions that are scalar with predication must not be considered
4734   // uniform after vectorization, because that would create an erroneous
4735   // replicating region where only a single instance out of VF should be formed.
4736   // TODO: optimize such seldom cases if found important, see PR40816.
4737   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4738     if (isScalarWithPredication(I, VF)) {
4739       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4740                         << *I << "\n");
4741       return;
4742     }
4743     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4744     Worklist.insert(I);
4745   };
4746 
4747   // Start with the conditional branch. If the branch condition is an
4748   // instruction contained in the loop that is only used by the branch, it is
4749   // uniform.
4750   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4751   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4752     addToWorklistIfAllowed(Cmp);
4753 
4754   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4755   // are pointers that are treated like consecutive pointers during
4756   // vectorization. The pointer operands of interleaved accesses are an
4757   // example.
4758   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4759 
4760   // Holds pointer operands of instructions that are possibly non-uniform.
4761   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4762 
4763   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4764     InstWidening WideningDecision = getWideningDecision(I, VF);
4765     assert(WideningDecision != CM_Unknown &&
4766            "Widening decision should be ready at this moment");
4767 
4768     return (WideningDecision == CM_Widen ||
4769             WideningDecision == CM_Widen_Reverse ||
4770             WideningDecision == CM_Interleave);
4771   };
4772   // Iterate over the instructions in the loop, and collect all
4773   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4774   // that a consecutive-like pointer operand will be scalarized, we collect it
4775   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4776   // getelementptr instruction can be used by both vectorized and scalarized
4777   // memory instructions. For example, if a loop loads and stores from the same
4778   // location, but the store is conditional, the store will be scalarized, and
4779   // the getelementptr won't remain uniform.
4780   for (auto *BB : TheLoop->blocks())
4781     for (auto &I : *BB) {
4782       // If there's no pointer operand, there's nothing to do.
4783       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4784       if (!Ptr)
4785         continue;
4786 
4787       // True if all users of Ptr are memory accesses that have Ptr as their
4788       // pointer operand.
4789       auto UsersAreMemAccesses =
4790           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4791             return getLoadStorePointerOperand(U) == Ptr;
4792           });
4793 
4794       // Ensure the memory instruction will not be scalarized or used by
4795       // gather/scatter, making its pointer operand non-uniform. If the pointer
4796       // operand is used by any instruction other than a memory access, we
4797       // conservatively assume the pointer operand may be non-uniform.
4798       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4799         PossibleNonUniformPtrs.insert(Ptr);
4800 
4801       // If the memory instruction will be vectorized and its pointer operand
4802       // is consecutive-like, or interleaving - the pointer operand should
4803       // remain uniform.
4804       else
4805         ConsecutiveLikePtrs.insert(Ptr);
4806     }
4807 
4808   // Add to the Worklist all consecutive and consecutive-like pointers that
4809   // aren't also identified as possibly non-uniform.
4810   for (auto *V : ConsecutiveLikePtrs)
4811     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4812       addToWorklistIfAllowed(V);
4813 
4814   // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures that
4816   // a uniform instruction will only be used by uniform instructions.
4817   unsigned idx = 0;
4818   while (idx != Worklist.size()) {
4819     Instruction *I = Worklist[idx++];
4820 
4821     for (auto OV : I->operand_values()) {
4822       // isOutOfScope operands cannot be uniform instructions.
4823       if (isOutOfScope(OV))
4824         continue;
4825       // First order recurrence Phi's should typically be considered
4826       // non-uniform.
4827       auto *OP = dyn_cast<PHINode>(OV);
4828       if (OP && Legal->isFirstOrderRecurrence(OP))
4829         continue;
4830       // If all the users of the operand are uniform, then add the
4831       // operand into the uniform worklist.
4832       auto *OI = cast<Instruction>(OV);
4833       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4834             auto *J = cast<Instruction>(U);
4835             return Worklist.count(J) ||
4836                    (OI == getLoadStorePointerOperand(J) &&
4837                     isUniformDecision(J, VF));
4838           }))
4839         addToWorklistIfAllowed(OI);
4840     }
4841   }
4842 
4843   // Returns true if Ptr is the pointer operand of a memory access instruction
4844   // I, and I is known to not require scalarization.
4845   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4846     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4847   };
4848 
4849   // For an instruction to be added into Worklist above, all its users inside
4850   // the loop should also be in Worklist. However, this condition cannot be
4851   // true for phi nodes that form a cyclic dependence. We must process phi
4852   // nodes separately. An induction variable will remain uniform if all users
4853   // of the induction variable and induction variable update remain uniform.
4854   // The code below handles both pointer and non-pointer induction variables.
4855   for (auto &Induction : Legal->getInductionVars()) {
4856     auto *Ind = Induction.first;
4857     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4858 
4859     // Determine if all users of the induction variable are uniform after
4860     // vectorization.
4861     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4862       auto *I = cast<Instruction>(U);
4863       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4864              isVectorizedMemAccessUse(I, Ind);
4865     });
4866     if (!UniformInd)
4867       continue;
4868 
4869     // Determine if all users of the induction variable update instruction are
4870     // uniform after vectorization.
4871     auto UniformIndUpdate =
4872         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4873           auto *I = cast<Instruction>(U);
4874           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4875                  isVectorizedMemAccessUse(I, IndUpdate);
4876         });
4877     if (!UniformIndUpdate)
4878       continue;
4879 
4880     // The induction variable and its update instruction will remain uniform.
4881     addToWorklistIfAllowed(Ind);
4882     addToWorklistIfAllowed(IndUpdate);
4883   }
4884 
4885   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4886 }
4887 
4888 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4889   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4890 
4891   if (Legal->getRuntimePointerChecking()->Need) {
4892     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4893         "runtime pointer checks needed. Enable vectorization of this "
4894         "loop with '#pragma clang loop vectorize(enable)' when "
4895         "compiling with -Os/-Oz",
4896         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4897     return true;
4898   }
4899 
4900   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4901     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4902         "runtime SCEV checks needed. Enable vectorization of this "
4903         "loop with '#pragma clang loop vectorize(enable)' when "
4904         "compiling with -Os/-Oz",
4905         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4906     return true;
4907   }
4908 
4909   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4910   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4911     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4912         "runtime stride == 1 checks needed. Enable vectorization of "
4913         "this loop with '#pragma clang loop vectorize(enable)' when "
4914         "compiling with -Os/-Oz",
4915         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4916     return true;
4917   }
4918 
4919   return false;
4920 }
4921 
4922 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4923   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4924     // TODO: It may be useful to do this, since the check is still likely to
4925     // be dynamically uniform if the target can skip it.
4926     reportVectorizationFailure(
4927         "Not inserting runtime ptr check for divergent target",
4928         "runtime pointer checks needed. Not enabled for divergent target",
4929         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4930     return None;
4931   }
4932 
4933   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4934   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4935   if (TC == 1) {
4936     reportVectorizationFailure("Single iteration (non) loop",
4937         "loop trip count is one, irrelevant for vectorization",
4938         "SingleIterationLoop", ORE, TheLoop);
4939     return None;
4940   }
4941 
4942   switch (ScalarEpilogueStatus) {
4943   case CM_ScalarEpilogueAllowed:
4944     return computeFeasibleMaxVF(TC);
4945   case CM_ScalarEpilogueNotNeededUsePredicate:
4946     LLVM_DEBUG(
4947         dbgs() << "LV: vector predicate hint/switch found.\n"
4948                << "LV: Not allowing scalar epilogue, creating predicated "
4949                << "vector loop.\n");
4950     break;
4951   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4952     // fallthrough as a special case of OptForSize
4953   case CM_ScalarEpilogueNotAllowedOptSize:
4954     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4955       LLVM_DEBUG(
4956           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4957     else
4958       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4959                         << "count.\n");
4960 
4961     // Bail out if runtime checks are required; they are undesirable when
4962     // optimizing for size.
4963     if (runtimeChecksRequired())
4964       return None;
4965     break;
4966   }
4967 
4968   // Now try the tail folding
4969 
4970   // Invalidate interleave groups that require an epilogue if we can't mask
4971   // the interleave-group.
4972   if (!useMaskedInterleavedAccesses(TTI))
4973     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4974 
4975   unsigned MaxVF = computeFeasibleMaxVF(TC);
4976   if (TC > 0 && TC % MaxVF == 0) {
4977     // Accept MaxVF if we do not have a tail.
4978     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4979     return MaxVF;
4980   }
4981 
4982   // If we don't know the precise trip count, or if the trip count that we
4983   // found modulo the vectorization factor is not zero, try to fold the tail
4984   // by masking.
4985   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4986   if (Legal->prepareToFoldTailByMasking()) {
4987     FoldTailByMasking = true;
4988     return MaxVF;
4989   }
4990 
4991   if (TC == 0) {
4992     reportVectorizationFailure(
4993         "Unable to calculate the loop count due to complex control flow",
4994         "unable to calculate the loop count due to complex control flow",
4995         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4996     return None;
4997   }
4998 
4999   reportVectorizationFailure(
5000       "Cannot optimize for size and vectorize at the same time.",
5001       "cannot optimize for size and vectorize at the same time. "
5002       "Enable vectorization of this loop with '#pragma clang loop "
5003       "vectorize(enable)' when compiling with -Os/-Oz",
5004       "NoTailLoopWithOptForSize", ORE, TheLoop);
5005   return None;
5006 }
5007 
5008 unsigned
5009 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5010   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5011   unsigned SmallestType, WidestType;
5012   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5013   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5014 
5015   // Get the maximum safe dependence distance in bits computed by LAA.
5016   // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
5017   // the memory access that is most restrictive (i.e. the one involved in
5018   // the smallest dependence distance).
5019   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5020 
5021   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5022 
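       // Worked example (illustrative values): with a 256-bit widest safe
       // register and a widest element type of 32 bits, MaxVectorSize below is
       // 256 / 32 = 8 lanes.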
5023   unsigned MaxVectorSize = WidestRegister / WidestType;
5024 
5025   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5026                     << " / " << WidestType << " bits.\n");
5027   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5028                     << WidestRegister << " bits.\n");
5029 
5030   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5031                                  " into one vector!");
5032   if (MaxVectorSize == 0) {
5033     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5034     MaxVectorSize = 1;
5035     return MaxVectorSize;
5036   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5037              isPowerOf2_32(ConstTripCount)) {
5038     // We need to clamp the VF to be the ConstTripCount. There is no point in
5039     // choosing a higher viable VF as done in the loop below.
5040     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5041                       << ConstTripCount << "\n");
5042     MaxVectorSize = ConstTripCount;
5043     return MaxVectorSize;
5044   }
5045 
5046   unsigned MaxVF = MaxVectorSize;
5047   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5048       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5049     // Collect all viable vectorization factors larger than the default MaxVF
5050     // (i.e. MaxVectorSize).
5051     SmallVector<unsigned, 8> VFs;
5052     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5053     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5054       VFs.push_back(VS);
5055 
5056     // For each VF calculate its register usage.
5057     auto RUs = calculateRegisterUsage(VFs);
5058 
5059     // Select the largest VF which doesn't require more registers than existing
5060     // ones.
5061     for (int i = RUs.size() - 1; i >= 0; --i) {
5062       bool Selected = true;
5063       for (auto& pair : RUs[i].MaxLocalUsers) {
5064         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5065         if (pair.second > TargetNumRegisters)
5066           Selected = false;
5067       }
5068       if (Selected) {
5069         MaxVF = VFs[i];
5070         break;
5071       }
5072     }
5073     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5074       if (MaxVF < MinVF) {
5075         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5076                           << ") with target's minimum: " << MinVF << '\n');
5077         MaxVF = MinVF;
5078       }
5079     }
5080   }
5081   return MaxVF;
5082 }
5083 
5084 VectorizationFactor
5085 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5086   float Cost = expectedCost(1).first;
5087   const float ScalarCost = Cost;
5088   unsigned Width = 1;
5089   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5090 
5091   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5092   if (ForceVectorization && MaxVF > 1) {
5093     // Ignore scalar width, because the user explicitly wants vectorization.
5094     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5095     // evaluation.
5096     Cost = std::numeric_limits<float>::max();
5097   }
5098 
5099   for (unsigned i = 2; i <= MaxVF; i *= 2) {
5100     // Notice that the vector loop needs to be executed fewer times, so
5101     // we need to divide the cost of the vector loop by the vectorization
5102     // factor (the number of vector elements).
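         // Worked example (illustrative costs): if expectedCost(4).first is 12,
         // the per-iteration cost at VF = 4 is 12 / 4 = 3, which is compared
         // against the running best (initially the scalar cost).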
5103     VectorizationCostTy C = expectedCost(i);
5104     float VectorCost = C.first / (float)i;
5105     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5106                       << " costs: " << (int)VectorCost << ".\n");
5107     if (!C.second && !ForceVectorization) {
5108       LLVM_DEBUG(
5109           dbgs() << "LV: Not considering vector loop of width " << i
5110                  << " because it will not generate any vector instructions.\n");
5111       continue;
5112     }
5113     if (VectorCost < Cost) {
5114       Cost = VectorCost;
5115       Width = i;
5116     }
5117   }
5118 
5119   if (!EnableCondStoresVectorization && NumPredStores) {
5120     reportVectorizationFailure("There are conditional stores.",
5121         "store that is conditionally executed prevents vectorization",
5122         "ConditionalStore", ORE, TheLoop);
5123     Width = 1;
5124     Cost = ScalarCost;
5125   }
5126 
5127   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5128              << "LV: Vectorization seems to be not beneficial, "
5129              << "but was forced by a user.\n");
5130   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5131   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5132   return Factor;
5133 }
5134 
5135 std::pair<unsigned, unsigned>
5136 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5137   unsigned MinWidth = -1U;
5138   unsigned MaxWidth = 8;
5139   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5140 
5141   // For each block.
5142   for (BasicBlock *BB : TheLoop->blocks()) {
5143     // For each instruction in the loop.
5144     for (Instruction &I : BB->instructionsWithoutDebug()) {
5145       Type *T = I.getType();
5146 
5147       // Skip ignored values.
5148       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5149         continue;
5150 
5151       // Only examine Loads, Stores and PHINodes.
5152       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5153         continue;
5154 
5155       // Examine PHI nodes that are reduction variables. Update the type to
5156       // account for the recurrence type.
5157       if (auto *PN = dyn_cast<PHINode>(&I)) {
5158         if (!Legal->isReductionVariable(PN))
5159           continue;
5160         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5161         T = RdxDesc.getRecurrenceType();
5162       }
5163 
5164       // Examine the stored values.
5165       if (auto *ST = dyn_cast<StoreInst>(&I))
5166         T = ST->getValueOperand()->getType();
5167 
5168       // Ignore loaded pointer types and stored pointer types that are not
5169       // vectorizable.
5170       //
5171       // FIXME: The check here attempts to predict whether a load or store will
5172       //        be vectorized. We only know this for certain after a VF has
5173       //        been selected. Here, we assume that if an access can be
5174       //        vectorized, it will be. We should also look at extending this
5175       //        optimization to non-pointer types.
5176       //
5177       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5178           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5179         continue;
5180 
5181       MinWidth = std::min(MinWidth,
5182                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5183       MaxWidth = std::max(MaxWidth,
5184                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5185     }
5186   }
5187 
5188   return {MinWidth, MaxWidth};
5189 }
5190 
5191 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5192                                                            unsigned LoopCost) {
5193   // -- The interleave heuristics --
5194   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5195   // There are many micro-architectural considerations that we can't predict
5196   // at this level. For example, frontend pressure (on decode or fetch) due to
5197   // code size, or the number and capabilities of the execution ports.
5198   //
5199   // We use the following heuristics to select the interleave count:
5200   // 1. If the code has reductions, then we interleave to break the cross
5201   // iteration dependency.
5202   // 2. If the loop is really small, then we interleave to reduce the loop
5203   // overhead.
5204   // 3. We don't interleave if we think that we will spill registers to memory
5205   // due to the increased register pressure.
5206 
5207   if (!isScalarEpilogueAllowed())
5208     return 1;
5209 
5210   // The dependence distance already limits the vector width; don't interleave.
5211   if (Legal->getMaxSafeDepDistBytes() != -1U)
5212     return 1;
5213 
5214   // Do not interleave loops with a relatively small known or estimated trip
5215   // count.
5216   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5217   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5218     return 1;
5219 
5220   RegisterUsage R = calculateRegisterUsage({VF})[0];
5221   // These counts are used as divisors below, so clamp each to at least one
5222   // register; i.e. assume at least one instruction uses at least one register.
5223   for (auto& pair : R.MaxLocalUsers) {
5224     pair.second = std::max(pair.second, 1U);
5225   }
5226 
5227   // We calculate the interleave count using the following formula.
5228   // Subtract the number of loop invariants from the number of available
5229   // registers. These registers are used by all of the interleaved instances.
5230   // Next, divide the remaining registers by the number of registers that is
5231   // required by the loop, in order to estimate how many parallel instances
5232   // fit without causing spills. All of this is rounded down if necessary to be
5233   // a power of two. We want a power-of-two interleave count to simplify any
5234   // addressing operations and alignment considerations.
5235   // We also want a power-of-two interleave count to ensure that the induction
5236   // variable of the vector loop wraps to zero when the tail is folded by
5237   // masking; this currently happens when optimizing for size, where IC is 1.
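       // Worked example (illustrative values, ignoring the induction-variable
       // heuristic below): with 16 registers in a class, 2 of them holding
       // loop-invariant values, and a maximum local usage of 7 registers, the
       // loop below computes PowerOf2Floor((16 - 2) / 7) = 2.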
5238   unsigned IC = UINT_MAX;
5239 
5240   for (auto& pair : R.MaxLocalUsers) {
5241     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5242     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5243                       << " registers of "
5244                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5245     if (VF == 1) {
5246       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5247         TargetNumRegisters = ForceTargetNumScalarRegs;
5248     } else {
5249       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5250         TargetNumRegisters = ForceTargetNumVectorRegs;
5251     }
5252     unsigned MaxLocalUsers = pair.second;
5253     unsigned LoopInvariantRegs = 0;
5254     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5255       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5256 
5257     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5258     // Don't count the induction variable as interleaved.
5259     if (EnableIndVarRegisterHeur) {
5260       TmpIC =
5261           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5262                         std::max(1U, (MaxLocalUsers - 1)));
5263     }
5264 
5265     IC = std::min(IC, TmpIC);
5266   }
5267 
5268   // Clamp the interleave ranges to reasonable counts.
5269   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5270 
5271   // Check if the user has overridden the max.
5272   if (VF == 1) {
5273     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5274       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5275   } else {
5276     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5277       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5278   }
5279 
5280   // If the trip count is a known or estimated compile-time constant, limit
5281   // the interleave count to at most the trip count divided by VF.
5282   if (BestKnownTC) {
5283     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5284   }
5285 
5286   // If we did not calculate the cost for VF (because the user selected the VF)
5287   // then we calculate the cost of VF here.
5288   if (LoopCost == 0)
5289     LoopCost = expectedCost(VF).first;
5290 
5291   assert(LoopCost && "Non-zero loop cost expected");
5292 
5293   // Clamp the calculated IC to be between 1 and the max interleave count
5294   // that the target and trip count allow.
5295   if (IC > MaxInterleaveCount)
5296     IC = MaxInterleaveCount;
5297   else if (IC < 1)
5298     IC = 1;
5299 
5300   // Interleave if we vectorized this loop and there is a reduction that could
5301   // benefit from interleaving.
5302   if (VF > 1 && !Legal->getReductionVars().empty()) {
5303     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5304     return IC;
5305   }
5306 
5307   // Note that if we've already vectorized the loop we will have done the
5308   // runtime check and so interleaving won't require further checks.
5309   bool InterleavingRequiresRuntimePointerCheck =
5310       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5311 
5312   // We want to interleave small loops in order to reduce the loop overhead and
5313   // potentially expose ILP opportunities.
5314   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5315   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5316     // We assume that the cost overhead is 1 and we use the cost model
5317     // to estimate the cost of the loop and interleave until the cost of the
5318     // loop overhead is about 5% of the cost of the loop.
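         // Worked example (illustrative costs): if LoopCost is a quarter of
         // SmallLoopCost, SmallIC becomes min(IC, PowerOf2Floor(4)) = min(IC, 4).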
5319     unsigned SmallIC =
5320         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5321 
5322     // Interleave until store/load ports (estimated by max interleave count) are
5323     // saturated.
5324     unsigned NumStores = Legal->getNumStores();
5325     unsigned NumLoads = Legal->getNumLoads();
5326     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5327     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
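         // E.g. (illustrative): with IC = 8, two stores and one load, StoresIC
         // is 8 / 2 = 4 and LoadsIC is 8 / 1 = 8.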
5328 
5329     // If we have a scalar reduction (vector reductions are already dealt with
5330     // by this point), we can increase the critical path length if the loop
5331     // we're interleaving is inside another loop. Limit, by default, to 2 so
5332     // that the critical path only gets increased by one reduction operation.
5333     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5334       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5335       SmallIC = std::min(SmallIC, F);
5336       StoresIC = std::min(StoresIC, F);
5337       LoadsIC = std::min(LoadsIC, F);
5338     }
5339 
5340     if (EnableLoadStoreRuntimeInterleave &&
5341         std::max(StoresIC, LoadsIC) > SmallIC) {
5342       LLVM_DEBUG(
5343           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5344       return std::max(StoresIC, LoadsIC);
5345     }
5346 
5347     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5348     return SmallIC;
5349   }
5350 
5351   // Interleave if this is a large loop (small loops are already dealt with by
5352   // this point) that could benefit from interleaving.
5353   bool HasReductions = !Legal->getReductionVars().empty();
5354   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5355     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5356     return IC;
5357   }
5358 
5359   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5360   return 1;
5361 }
5362 
5363 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5364 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5365   // This function calculates the register usage by measuring the highest
5366   // number of values that are alive at a single location. Obviously, this is
5367   // a very rough estimation. We scan the loop in topological order and
5368   // assign a number to each instruction. We use RPO to ensure that defs are
5369   // met before their users. We assume that each instruction that has in-loop
5370   // users starts an interval. We record every time that an in-loop value is
5371   // used, so we have a list of the first and last occurrences of each
5372   // instruction. Next, we transpose this data structure into a multi map that
5373   // holds the list of intervals that *end* at a specific location. This multi
5374   // map allows us to perform a linear search. We scan the instructions linearly
5375   // and record each time that a new interval starts, by placing it in a set.
5376   // If we find this value in the multi-map then we remove it from the set.
5377   // The max register usage is the maximum size of the set.
5378   // We also search for instructions that are defined outside the loop, but are
5379   // used inside the loop. We need this number separately from the max-interval
5380   // usage number because when we unroll, loop-invariant values do not take
5381   // more registers.
5382   LoopBlocksDFS DFS(TheLoop);
5383   DFS.perform(LI);
5384 
5385   RegisterUsage RU;
5386 
5387   // Each 'key' in the map opens a new interval. The values
5388   // of the map are the index of the 'last seen' usage of the
5389   // instruction that is the key.
5390   using IntervalMap = DenseMap<Instruction *, unsigned>;
5391 
5392   // Maps an index to the corresponding instruction.
5393   SmallVector<Instruction *, 64> IdxToInstr;
5394   // Marks the end of each interval.
5395   IntervalMap EndPoint;
5396   // Saves the set of instructions that are used in the loop.
5397   SmallPtrSet<Instruction *, 8> Ends;
5398   // Saves the list of instructions that are used in the loop but are
5399   // defined outside the loop.
5400   SmallPtrSet<Value *, 8> LoopInvariants;
5401 
5402   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5403     for (Instruction &I : BB->instructionsWithoutDebug()) {
5404       IdxToInstr.push_back(&I);
5405 
5406       // Save the end location of each USE.
5407       for (Value *U : I.operands()) {
5408         auto *Instr = dyn_cast<Instruction>(U);
5409 
5410         // Ignore non-instruction values such as arguments, constants, etc.
5411         if (!Instr)
5412           continue;
5413 
5414         // If this instruction is outside the loop then record it and continue.
5415         if (!TheLoop->contains(Instr)) {
5416           LoopInvariants.insert(Instr);
5417           continue;
5418         }
5419 
5420         // Overwrite previous end points.
5421         EndPoint[Instr] = IdxToInstr.size();
5422         Ends.insert(Instr);
5423       }
5424     }
5425   }
5426 
5427   // Saves the list of intervals that end with the index in 'key'.
5428   using InstrList = SmallVector<Instruction *, 2>;
5429   DenseMap<unsigned, InstrList> TransposeEnds;
5430 
5431   // Transpose the EndPoints to a list of values that end at each index.
5432   for (auto &Interval : EndPoint)
5433     TransposeEnds[Interval.second].push_back(Interval.first);
5434 
5435   SmallPtrSet<Instruction *, 8> OpenIntervals;
5436 
5437   // Get the size of the widest register.
5438   unsigned MaxSafeDepDist = -1U;
5439   if (Legal->getMaxSafeDepDistBytes() != -1U)
5440     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5441   unsigned WidestRegister =
5442       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5443   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5444 
5445   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5446   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5447 
5448   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5449 
5450   // A lambda that gets the register usage for the given type and VF.
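       // Worked example (illustrative values): an i32 value at VF = 8 with a
       // 128-bit widest register occupies max(1, 8 * 32 / 128) = 2 registers.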
5451   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5452     if (Ty->isTokenTy())
5453       return 0U;
5454     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5455     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5456   };
5457 
5458   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5459     Instruction *I = IdxToInstr[i];
5460 
5461     // Remove all of the instructions that end at this location.
5462     InstrList &List = TransposeEnds[i];
5463     for (Instruction *ToRemove : List)
5464       OpenIntervals.erase(ToRemove);
5465 
5466     // Ignore instructions that are never used within the loop.
5467     if (Ends.find(I) == Ends.end())
5468       continue;
5469 
5470     // Skip ignored values.
5471     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5472       continue;
5473 
5474     // For each VF find the maximum usage of registers.
5475     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5476       // Count the number of live intervals.
5477       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5478 
5479       if (VFs[j] == 1) {
5480         for (auto Inst : OpenIntervals) {
5481           unsigned ClassID =
5482               TTI.getRegisterClassForType(false, Inst->getType());
5483           // Map entries are value-initialized to zero on first access.
5484           RegUsage[ClassID] += 1;
5485         }
5486       } else {
5487         collectUniformsAndScalars(VFs[j]);
5488         for (auto Inst : OpenIntervals) {
5489           // Skip ignored values for VF > 1.
5490           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5491             continue;
5492           if (isScalarAfterVectorization(Inst, VFs[j])) {
5493             unsigned ClassID =
5494                 TTI.getRegisterClassForType(false, Inst->getType());
5495             RegUsage[ClassID] += 1;
5496           } else {
5497             unsigned ClassID =
5498                 TTI.getRegisterClassForType(true, Inst->getType());
5499             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5500           }
5501         }
5502       }
5508 
5509       for (auto &pair : RegUsage)
5510         MaxUsages[j][pair.first] =
5511             std::max(MaxUsages[j][pair.first], pair.second);
5515     }
5516 
5517     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5518                       << OpenIntervals.size() << '\n');
5519 
5520     // Add the current instruction to the list of open intervals.
5521     OpenIntervals.insert(I);
5522   }
5523 
5524   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5525     SmallMapVector<unsigned, unsigned, 4> Invariant;
5526 
5527     for (auto Inst : LoopInvariants) {
5528       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5529       unsigned ClassID =
5530           TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5531       Invariant[ClassID] += Usage;
5532     }
5535 
5536     LLVM_DEBUG({
5537       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5538       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5539              << " item\n";
5540       for (const auto &pair : MaxUsages[i]) {
5541         dbgs() << "LV(REG): RegisterClass: "
5542                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5543                << " registers\n";
5544       }
5545       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5546              << " item\n";
5547       for (const auto &pair : Invariant) {
5548         dbgs() << "LV(REG): RegisterClass: "
5549                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5550                << " registers\n";
5551       }
5552     });
5553 
5554     RU.LoopInvariantRegs = Invariant;
5555     RU.MaxLocalUsers = MaxUsages[i];
5556     RUs[i] = RU;
5557   }
5558 
5559   return RUs;
5560 }
5561 
5562 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5563   // TODO: Cost model for emulated masked load/store is completely
5564   // broken. This hack guides the cost model to use an artificially
5565   // high enough value to practically disable vectorization with such
5566   // operations, except where the previously deployed legality hack allowed
5567   // using very low cost values. This is to avoid regressions coming simply
5568   // from moving the "masked load/store" check from legality to the cost
5569   // model. Masked load/gather emulation was previously never allowed;
5570   // emulation of a limited number of masked stores/scatters was allowed.
5571   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5572   return isa<LoadInst>(I) ||
5573          (isa<StoreInst>(I) &&
5574           NumPredStores > NumberOfStoresToPredicate);
5575 }
5576 
5577 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5578   // If we aren't vectorizing the loop, or if we've already collected the
5579   // instructions to scalarize, there's nothing to do. Collection may already
5580   // have occurred if we have a user-selected VF and are now computing the
5581   // expected cost for interleaving.
5582   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5583     return;
5584 
5585   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5586   // not profitable to scalarize any instructions, the presence of VF in the
5587   // map will indicate that we've analyzed it already.
5588   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5589 
5590   // Find all the instructions that are scalar with predication in the loop and
5591   // determine if it would be better to not if-convert the blocks they are in.
5592   // If so, we also record the instructions to scalarize.
5593   for (BasicBlock *BB : TheLoop->blocks()) {
5594     if (!blockNeedsPredication(BB))
5595       continue;
5596     for (Instruction &I : *BB)
5597       if (isScalarWithPredication(&I)) {
5598         ScalarCostsTy ScalarCosts;
5599         // Do not apply discount logic if hacked cost is needed
5600         // for emulated masked memrefs.
5601         if (!useEmulatedMaskMemRefHack(&I) &&
5602             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5603           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5604         // Remember that BB will remain after vectorization.
5605         PredicatedBBsAfterVectorization.insert(BB);
5606       }
5607   }
5608 }
5609 
5610 int LoopVectorizationCostModel::computePredInstDiscount(
5611     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5612     unsigned VF) {
5613   assert(!isUniformAfterVectorization(PredInst, VF) &&
5614          "Instruction marked uniform-after-vectorization will be predicated");
5615 
5616   // Initialize the discount to zero, meaning that the scalar version and the
5617   // vector version cost the same.
5618   int Discount = 0;
5619 
5620   // Holds instructions to analyze. The instructions we visit are mapped in
5621   // ScalarCosts. Those instructions are the ones that would be scalarized if
5622   // we find that the scalar version costs less.
5623   SmallVector<Instruction *, 8> Worklist;
5624 
5625   // Returns true if the given instruction can be scalarized.
5626   auto canBeScalarized = [&](Instruction *I) -> bool {
5627     // We only attempt to scalarize instructions forming a single-use chain
5628     // from the original predicated block that would otherwise be vectorized.
5629     // Although not strictly necessary, we give up on instructions we know will
5630     // already be scalar to avoid traversing chains that are unlikely to be
5631     // beneficial.
5632     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5633         isScalarAfterVectorization(I, VF))
5634       return false;
5635 
5636     // If the instruction is scalar with predication, it will be analyzed
5637     // separately. We ignore it within the context of PredInst.
5638     if (isScalarWithPredication(I))
5639       return false;
5640 
5641     // If any of the instruction's operands are uniform after vectorization,
5642     // the instruction cannot be scalarized. This prevents, for example, a
5643     // masked load from being scalarized.
5644     //
5645     // We assume we will only emit a value for lane zero of an instruction
5646     // marked uniform after vectorization, rather than VF identical values.
5647     // Thus, if we scalarize an instruction that uses a uniform, we would
5648     // create uses of values corresponding to the lanes we aren't emitting code
5649     // for. This behavior can be changed by allowing getScalarValue to clone
5650     // the lane zero values for uniforms rather than asserting.
5651     for (Use &U : I->operands())
5652       if (auto *J = dyn_cast<Instruction>(U.get()))
5653         if (isUniformAfterVectorization(J, VF))
5654           return false;
5655 
5656     // Otherwise, we can scalarize the instruction.
5657     return true;
5658   };
5659 
5660   // Compute the expected cost discount from scalarizing the entire expression
5661   // feeding the predicated instruction. We currently only consider expressions
5662   // that are single-use instruction chains.
5663   Worklist.push_back(PredInst);
5664   while (!Worklist.empty()) {
5665     Instruction *I = Worklist.pop_back_val();
5666 
5667     // If we've already analyzed the instruction, there's nothing to do.
5668     if (ScalarCosts.find(I) != ScalarCosts.end())
5669       continue;
5670 
5671     // Compute the cost of the vector instruction. Note that this cost already
5672     // includes the scalarization overhead of the predicated instruction.
5673     unsigned VectorCost = getInstructionCost(I, VF).first;
5674 
5675     // Compute the cost of the scalarized instruction. This cost is the cost of
5676     // the instruction as if it wasn't if-converted and instead remained in the
5677     // predicated block. We will scale this cost by block probability after
5678     // computing the scalarization overhead.
5679     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5680 
5681     // Compute the scalarization overhead of needed insertelement instructions
5682     // and phi nodes.
5683     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5684       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5685                                                  true, false);
5686       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5687     }
5688 
5689     // Compute the scalarization overhead of needed extractelement
5690     // instructions. For each of the instruction's operands, if the operand can
5691     // be scalarized, add it to the worklist; otherwise, account for the
5692     // overhead.
5693     for (Use &U : I->operands())
5694       if (auto *J = dyn_cast<Instruction>(U.get())) {
5695         assert(VectorType::isValidElementType(J->getType()) &&
5696                "Instruction has non-scalar type");
5697         if (canBeScalarized(J))
5698           Worklist.push_back(J);
5699         else if (needsExtract(J, VF))
5700           ScalarCost += TTI.getScalarizationOverhead(
5701                               ToVectorTy(J->getType(),VF), false, true);
5702       }
5703 
5704     // Scale the total scalar cost by block probability.
5705     ScalarCost /= getReciprocalPredBlockProb();
5706 
5707     // Compute the discount. A non-negative discount means the vector version
5708     // of the instruction costs more, and scalarizing would be beneficial.
5709     Discount += VectorCost - ScalarCost;
5710     ScalarCosts[I] = ScalarCost;
5711   }
5712 
5713   return Discount;
5714 }
5715 
5716 LoopVectorizationCostModel::VectorizationCostTy
5717 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5718   VectorizationCostTy Cost;
5719 
5720   // For each block.
5721   for (BasicBlock *BB : TheLoop->blocks()) {
5722     VectorizationCostTy BlockCost;
5723 
5724     // For each instruction in the old loop.
5725     for (Instruction &I : BB->instructionsWithoutDebug()) {
5726       // Skip ignored values.
5727       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5728           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5729         continue;
5730 
5731       VectorizationCostTy C = getInstructionCost(&I, VF);
5732 
5733       // Check if we should override the cost.
5734       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5735         C.first = ForceTargetInstructionCost;
5736 
5737       BlockCost.first += C.first;
5738       BlockCost.second |= C.second;
5739       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5740                         << " for VF " << VF << " For instruction: " << I
5741                         << '\n');
5742     }
5743 
5744     // If we are vectorizing a predicated block, it will have been
5745     // if-converted. This means that the block's instructions (aside from
5746     // stores and instructions that may divide by zero) will now be
5747     // unconditionally executed. For the scalar case, we may not always execute
5748     // the predicated block. Thus, scale the block's cost by the probability of
5749     // executing it.
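         // E.g. (illustrative): assuming the reciprocal block probability is 2
         // (a 50% chance of executing the block), a scalar block cost of 10
         // contributes 10 / 2 = 5 to the loop cost.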
5750     if (VF == 1 && blockNeedsPredication(BB))
5751       BlockCost.first /= getReciprocalPredBlockProb();
5752 
5753     Cost.first += BlockCost.first;
5754     Cost.second |= BlockCost.second;
5755   }
5756 
5757   return Cost;
5758 }
5759 
5760 /// Gets the address access SCEV after verifying that the access pattern is
5761 /// loop invariant except for the induction variable dependence.
5762 ///
5763 /// This SCEV can be sent to the Target in order to estimate the address
5764 /// calculation cost.
5765 static const SCEV *getAddressAccessSCEV(
5766               Value *Ptr,
5767               LoopVectorizationLegality *Legal,
5768               PredicatedScalarEvolution &PSE,
5769               const Loop *TheLoop) {
5770 
5771   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5772   if (!Gep)
5773     return nullptr;
5774 
5775   // We are looking for a gep with all loop invariant indices except for one
5776   // which should be an induction variable.
5777   auto SE = PSE.getSE();
5778   unsigned NumOperands = Gep->getNumOperands();
5779   for (unsigned i = 1; i < NumOperands; ++i) {
5780     Value *Opd = Gep->getOperand(i);
5781     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5782         !Legal->isInductionVariable(Opd))
5783       return nullptr;
5784   }
5785 
5786   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5787   return PSE.getSCEV(Ptr);
5788 }
5789 
5790 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5791   return Legal->hasStride(I->getOperand(0)) ||
5792          Legal->hasStride(I->getOperand(1));
5793 }
5794 
5795 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5796                                                                  unsigned VF) {
5797   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5798   Type *ValTy = getMemInstValueType(I);
5799   auto SE = PSE.getSE();
5800 
5801   unsigned AS = getLoadStoreAddressSpace(I);
5802   Value *Ptr = getLoadStorePointerOperand(I);
5803   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5804 
5805   // Figure out whether the access is strided and get the stride value
5806   // if it's known at compile time.
5807   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5808 
5809   // Get the cost of the scalar memory instruction and address computation.
5810   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5811 
5812   // Don't pass *I here, since it is scalar but will actually be part of a
5813   // vectorized loop where the user of it is a vectorized instruction.
5814   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5815   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5816                                    Alignment, AS);
5817 
5818   // Get the overhead of the extractelement and insertelement instructions
5819   // we might create due to scalarization.
5820   Cost += getScalarizationOverhead(I, VF);
5821 
5822   // If we have a predicated store, it may not be executed for each vector
5823   // lane. Scale the cost by the probability of executing the predicated
5824   // block.
5825   if (isPredicatedInst(I)) {
5826     Cost /= getReciprocalPredBlockProb();
5827 
5828     if (useEmulatedMaskMemRefHack(I))
5829       // Artificially setting to a high enough value to practically disable
5830       // vectorization with such operations.
5831       Cost = 3000000;
5832   }
5833 
5834   return Cost;
5835 }
5836 
5837 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5838                                                              unsigned VF) {
5839   Type *ValTy = getMemInstValueType(I);
5840   Type *VectorTy = ToVectorTy(ValTy, VF);
5841   Value *Ptr = getLoadStorePointerOperand(I);
5842   unsigned AS = getLoadStoreAddressSpace(I);
5843   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5844 
5845   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5846          "Stride should be 1 or -1 for consecutive memory access");
5847   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5848   unsigned Cost = 0;
5849   if (Legal->isMaskRequired(I))
5850     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5851                                       Alignment ? Alignment->value() : 0, AS);
5852   else
5853     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5854 
5855   bool Reverse = ConsecutiveStride < 0;
5856   if (Reverse)
5857     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5858   return Cost;
5859 }
5860 
5861 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5862                                                          unsigned VF) {
5863   Type *ValTy = getMemInstValueType(I);
5864   Type *VectorTy = ToVectorTy(ValTy, VF);
5865   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5866   unsigned AS = getLoadStoreAddressSpace(I);
5867   if (isa<LoadInst>(I)) {
5868     return TTI.getAddressComputationCost(ValTy) +
5869            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5870            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5871   }
5872   StoreInst *SI = cast<StoreInst>(I);
5873 
5874   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
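       // E.g. (illustrative): for a uniform store of a loop-varying value at
       // VF = 4, the cost below includes extracting the last lane (index
       // VF - 1 = 3) from the stored value vector.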
5875   return TTI.getAddressComputationCost(ValTy) +
5876          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5877          (isLoopInvariantStoreValue
5878               ? 0
5879               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5880                                        VF - 1));
5881 }
5882 
5883 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5884                                                           unsigned VF) {
5885   Type *ValTy = getMemInstValueType(I);
5886   Type *VectorTy = ToVectorTy(ValTy, VF);
5887   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5888   Value *Ptr = getLoadStorePointerOperand(I);
5889 
5890   return TTI.getAddressComputationCost(VectorTy) +
5891          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5892                                     Legal->isMaskRequired(I),
5893                                     Alignment ? Alignment->value() : 0, I);
5894 }
5895 
5896 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5897                                                             unsigned VF) {
5898   Type *ValTy = getMemInstValueType(I);
5899   Type *VectorTy = ToVectorTy(ValTy, VF);
5900   unsigned AS = getLoadStoreAddressSpace(I);
5901 
5902   auto Group = getInterleavedAccessGroup(I);
5903   assert(Group && "Failed to get an interleaved access group.");
5904 
5905   unsigned InterleaveFactor = Group->getFactor();
5906   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
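       // E.g. (illustrative): an interleave group of factor 2 over i32 elements
       // at VF = 4 is costed using the wide vector type <8 x i32> below.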
5907 
5908   // Holds the indices of existing members in an interleaved load group.
5909   // An interleaved store group doesn't need this as it doesn't allow gaps.
5910   SmallVector<unsigned, 4> Indices;
5911   if (isa<LoadInst>(I)) {
5912     for (unsigned i = 0; i < InterleaveFactor; i++)
5913       if (Group->getMember(i))
5914         Indices.push_back(i);
5915   }
5916 
5917   // Calculate the cost of the whole interleaved group.
5918   bool UseMaskForGaps =
5919       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5920   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5921       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5922       Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5923 
5924   if (Group->isReverse()) {
5925     // TODO: Add support for reversed masked interleaved access.
5926     assert(!Legal->isMaskRequired(I) &&
5927            "Reverse masked interleaved access not supported.");
5928     Cost += Group->getNumMembers() *
5929             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5930   }
5931   return Cost;
5932 }
5933 
5934 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5935                                                               unsigned VF) {
5936   // Calculate scalar cost only. Vectorization cost should be ready at this
5937   // moment.
5938   if (VF == 1) {
5939     Type *ValTy = getMemInstValueType(I);
5940     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5941     unsigned AS = getLoadStoreAddressSpace(I);
5942 
5943     return TTI.getAddressComputationCost(ValTy) +
5944            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5945   }
5946   return getWideningCost(I, VF);
5947 }
5948 
5949 LoopVectorizationCostModel::VectorizationCostTy
5950 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5951   // If we know that this instruction will remain uniform, check the cost of
5952   // the scalar version.
5953   if (isUniformAfterVectorization(I, VF))
5954     VF = 1;
5955 
5956   if (VF > 1 && isProfitableToScalarize(I, VF))
5957     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5958 
5959   // Forced scalars do not have any scalarization overhead.
5960   auto ForcedScalar = ForcedScalars.find(VF);
5961   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5962     auto InstSet = ForcedScalar->second;
5963     if (InstSet.find(I) != InstSet.end())
5964       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5965   }
5966 
5967   Type *VectorTy;
5968   unsigned C = getInstructionCost(I, VF, VectorTy);
5969 
5970   bool TypeNotScalarized =
5971       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5972   return VectorizationCostTy(C, TypeNotScalarized);
5973 }
5974 
5975 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5976                                                               unsigned VF) {
5978   if (VF == 1)
5979     return 0;
5980 
5981   unsigned Cost = 0;
5982   Type *RetTy = ToVectorTy(I->getType(), VF);
5983   if (!RetTy->isVoidTy() &&
5984       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5985     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5986 
5987   // Some targets keep addresses scalar.
5988   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5989     return Cost;
5990 
5991   // Some targets support efficient element stores.
5992   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5993     return Cost;
5994 
5995   // Collect operands to consider.
5996   CallInst *CI = dyn_cast<CallInst>(I);
5997   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5998 
5999   // Skip operands that do not require extraction/scalarization and do not incur
6000   // any overhead.
6001   return Cost + TTI.getOperandsScalarizationOverhead(
6002                     filterExtractingOperands(Ops, VF), VF);
6003 }
6004 
6005 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6006   if (VF == 1)
6007     return;
6008   NumPredStores = 0;
6009   for (BasicBlock *BB : TheLoop->blocks()) {
6010     // For each instruction in the old loop.
6011     for (Instruction &I : *BB) {
6012       Value *Ptr = getLoadStorePointerOperand(&I);
6013       if (!Ptr)
6014         continue;
6015 
6016       // TODO: We should generate better code and update the cost model for
6017       // predicated uniform stores. Today they are treated as any other
6018       // predicated store (see added test cases in
6019       // invariant-store-vectorization.ll).
6020       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6021         NumPredStores++;
6022 
6023       if (Legal->isUniform(Ptr) &&
6024           // Conditional loads and stores should be scalarized and predicated.
6025           // isScalarWithPredication cannot be used here since masked
6026           // gather/scatters are not considered scalar with predication.
6027           !Legal->blockNeedsPredication(I.getParent())) {
6028         // TODO: Avoid replicating loads and stores instead of
6029         // relying on instcombine to remove them.
6030         // Load: Scalar load + broadcast
6031         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6032         unsigned Cost = getUniformMemOpCost(&I, VF);
6033         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6034         continue;
6035       }
6036 
6037       // We assume that widening is the best solution when possible.
6038       if (memoryInstructionCanBeWidened(&I, VF)) {
6039         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6040         int ConsecutiveStride =
6041                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6042         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6043                "Expected consecutive stride.");
6044         InstWidening Decision =
6045             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6046         setWideningDecision(&I, VF, Decision, Cost);
6047         continue;
6048       }
6049 
6050       // Choose between Interleaving, Gather/Scatter or Scalarization.
6051       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6052       unsigned NumAccesses = 1;
6053       if (isAccessInterleaved(&I)) {
6054         auto Group = getInterleavedAccessGroup(&I);
6055         assert(Group && "Failed to get an interleaved access group.");
6056 
6057         // Make one decision for the whole group.
6058         if (getWideningDecision(&I, VF) != CM_Unknown)
6059           continue;
6060 
6061         NumAccesses = Group->getNumMembers();
6062         if (interleavedAccessCanBeWidened(&I, VF))
6063           InterleaveCost = getInterleaveGroupCost(&I, VF);
6064       }
6065 
6066       unsigned GatherScatterCost =
6067           isLegalGatherOrScatter(&I)
6068               ? getGatherScatterCost(&I, VF) * NumAccesses
6069               : std::numeric_limits<unsigned>::max();
6070 
6071       unsigned ScalarizationCost =
6072           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6073 
6074       // Choose better solution for the current VF,
6075       // write down this decision and use it during vectorization.
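           // E.g. (illustrative): with InterleaveCost = 4, GatherScatterCost = 6
           // and ScalarizationCost = 10, interleaving is chosen at cost 4.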
6076       unsigned Cost;
6077       InstWidening Decision;
6078       if (InterleaveCost <= GatherScatterCost &&
6079           InterleaveCost < ScalarizationCost) {
6080         Decision = CM_Interleave;
6081         Cost = InterleaveCost;
6082       } else if (GatherScatterCost < ScalarizationCost) {
6083         Decision = CM_GatherScatter;
6084         Cost = GatherScatterCost;
6085       } else {
6086         Decision = CM_Scalarize;
6087         Cost = ScalarizationCost;
6088       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group is charged the cost, but
      // the cost is actually assigned to a single instruction.
6092       if (auto Group = getInterleavedAccessGroup(&I))
6093         setWideningDecision(Group, VF, Decision, Cost);
6094       else
6095         setWideningDecision(&I, VF, Decision, Cost);
6096     }
6097   }
6098 
  // Make sure that any load of an address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also gives LSR more
  // opportunities, since that pass can't optimize vectorized addresses.
6104   if (TTI.prefersVectorizedAddressing())
6105     return;
6106 
6107   // Start with all scalar pointer uses.
6108   SmallPtrSet<Instruction *, 8> AddrDefs;
6109   for (BasicBlock *BB : TheLoop->blocks())
6110     for (Instruction &I : *BB) {
6111       Instruction *PtrDef =
6112         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6113       if (PtrDef && TheLoop->contains(PtrDef) &&
6114           getWideningDecision(&I, VF) != CM_GatherScatter)
6115         AddrDefs.insert(PtrDef);
6116     }
6117 
6118   // Add all instructions used to generate the addresses.
6119   SmallVector<Instruction *, 4> Worklist;
6120   for (auto *I : AddrDefs)
6121     Worklist.push_back(I);
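  // Transitively add the operands feeding each address, staying within the
  // defining instruction's basic block and stopping at PHIs.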
6122   while (!Worklist.empty()) {
6123     Instruction *I = Worklist.pop_back_val();
6124     for (auto &Op : I->operands())
6125       if (auto *InstOp = dyn_cast<Instruction>(Op))
6126         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6127             AddrDefs.insert(InstOp).second)
6128           Worklist.push_back(InstOp);
6129   }
6130 
6131   for (auto *I : AddrDefs) {
6132     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this requires knowing whether the
      // loaded value feeds an address computation, it is instead changed
      // here once we know that is the case.
6137       InstWidening Decision = getWideningDecision(I, VF);
6138       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6139         // Scalarize a widened load of address.
6140         setWideningDecision(I, VF, CM_Scalarize,
6141                             (VF * getMemoryInstructionCost(I, 1)));
6142       else if (auto Group = getInterleavedAccessGroup(I)) {
6143         // Scalarize an interleave group of address loads.
6144         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6145           if (Instruction *Member = Group->getMember(I))
6146             setWideningDecision(Member, VF, CM_Scalarize,
6147                                 (VF * getMemoryInstructionCost(Member, 1)));
6148         }
6149       }
6150     } else
      // Make sure I gets scalarized, with a cost estimate that does not
      // include scalarization overhead.
6153       ForcedScalars[VF].insert(I);
6154   }
6155 }
6156 
6157 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6158                                                         unsigned VF,
6159                                                         Type *&VectorTy) {
6160   Type *RetTy = I->getType();
6161   if (canTruncateToMinimalBitwidth(I, VF))
6162     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6163   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6164   auto SE = PSE.getSE();
6165 
6166   // TODO: We need to estimate the cost of intrinsic calls.
6167   switch (I->getOpcode()) {
6168   case Instruction::GetElementPtr:
6169     // We mark this instruction as zero-cost because the cost of GEPs in
6170     // vectorized code depends on whether the corresponding memory instruction
6171     // is scalarized or not. Therefore, we handle GEPs with the memory
6172     // instruction cost.
6173     return 0;
6174   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6178     bool ScalarPredicatedBB = false;
6179     BranchInst *BI = cast<BranchInst>(I);
6180     if (VF > 1 && BI->isConditional() &&
6181         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6182              PredicatedBBsAfterVectorization.end() ||
6183          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6184              PredicatedBBsAfterVectorization.end()))
6185       ScalarPredicatedBB = true;
6186 
6187     if (ScalarPredicatedBB) {
6188       // Return cost for branches around scalarized and predicated blocks.
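      // The cost is one extract per i1 lane of the condition vector plus one
      // branch per lane.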
6189       Type *Vec_i1Ty =
6190           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6191       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6192               (TTI.getCFInstrCost(Instruction::Br) * VF));
6193     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6194       // The back-edge branch will remain, as will all scalar branches.
6195       return TTI.getCFInstrCost(Instruction::Br);
6196     else
6197       // This branch will be eliminated by if-conversion.
6198       return 0;
6199     // Note: We currently assume zero cost for an unconditional branch inside
6200     // a predicated block since it will become a fall-through, although we
6201     // may decide in the future to call TTI for all branches.
6202   }
6203   case Instruction::PHI: {
6204     auto *Phi = cast<PHINode>(I);
6205 
6206     // First-order recurrences are replaced by vector shuffles inside the loop.
6207     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6208     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6209       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6210                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6211 
6212     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6213     // converted into select instructions. We require N - 1 selects per phi
6214     // node, where N is the number of incoming values.
6215     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6216       return (Phi->getNumIncomingValues() - 1) *
6217              TTI.getCmpSelInstrCost(
6218                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6219                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6220 
6221     return TTI.getCFInstrCost(Instruction::PHI);
6222   }
6223   case Instruction::UDiv:
6224   case Instruction::SDiv:
6225   case Instruction::URem:
6226   case Instruction::SRem:
6227     // If we have a predicated instruction, it may not be executed for each
6228     // vector lane. Get the scalarization cost and scale this amount by the
6229     // probability of executing the predicated block. If the instruction is not
6230     // predicated, we fall through to the next case.
6231     if (VF > 1 && isScalarWithPredication(I)) {
6232       unsigned Cost = 0;
6233 
6234       // These instructions have a non-void type, so account for the phi nodes
6235       // that we will create. This cost is likely to be zero. The phi node
6236       // cost, if any, should be scaled by the block probability because it
6237       // models a copy at the end of each predicated block.
6238       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6239 
6240       // The cost of the non-predicated instruction.
6241       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6242 
6243       // The cost of insertelement and extractelement instructions needed for
6244       // scalarization.
6245       Cost += getScalarizationOverhead(I, VF);
6246 
6247       // Scale the cost by the probability of executing the predicated blocks.
6248       // This assumes the predicated block for each vector lane is equally
6249       // likely.
6250       return Cost / getReciprocalPredBlockProb();
6251     }
6252     LLVM_FALLTHROUGH;
6253   case Instruction::Add:
6254   case Instruction::FAdd:
6255   case Instruction::Sub:
6256   case Instruction::FSub:
6257   case Instruction::Mul:
6258   case Instruction::FMul:
6259   case Instruction::FDiv:
6260   case Instruction::FRem:
6261   case Instruction::Shl:
6262   case Instruction::LShr:
6263   case Instruction::AShr:
6264   case Instruction::And:
6265   case Instruction::Or:
6266   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
6268     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6269       return 0;
6270     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6272     Value *Op2 = I->getOperand(1);
6273     TargetTransformInfo::OperandValueProperties Op2VP;
6274     TargetTransformInfo::OperandValueKind Op2VK =
6275         TTI.getOperandInfo(Op2, Op2VP);
6276     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6277       Op2VK = TargetTransformInfo::OK_UniformValue;
6278 
6279     SmallVector<const Value *, 4> Operands(I->operand_values());
6280     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6281     return N * TTI.getArithmeticInstrCost(
6282                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6283                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6284   }
6285   case Instruction::FNeg: {
6286     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6287     return N * TTI.getArithmeticInstrCost(
6288                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6289                    TargetTransformInfo::OK_AnyValue,
6290                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6291                    I->getOperand(0), I);
6292   }
6293   case Instruction::Select: {
6294     SelectInst *SI = cast<SelectInst>(I);
6295     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6296     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6297     Type *CondTy = SI->getCondition()->getType();
6298     if (!ScalarCond)
6299       CondTy = VectorType::get(CondTy, VF);
6300 
6301     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6302   }
6303   case Instruction::ICmp:
6304   case Instruction::FCmp: {
6305     Type *ValTy = I->getOperand(0)->getType();
6306     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6307     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6308       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6309     VectorTy = ToVectorTy(ValTy, VF);
6310     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6311   }
6312   case Instruction::Store:
6313   case Instruction::Load: {
6314     unsigned Width = VF;
6315     if (Width > 1) {
6316       InstWidening Decision = getWideningDecision(I, Width);
6317       assert(Decision != CM_Unknown &&
6318              "CM decision should be taken at this point");
6319       if (Decision == CM_Scalarize)
6320         Width = 1;
6321     }
6322     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6323     return getMemoryInstructionCost(I, VF);
6324   }
6325   case Instruction::ZExt:
6326   case Instruction::SExt:
6327   case Instruction::FPToUI:
6328   case Instruction::FPToSI:
6329   case Instruction::FPExt:
6330   case Instruction::PtrToInt:
6331   case Instruction::IntToPtr:
6332   case Instruction::SIToFP:
6333   case Instruction::UIToFP:
6334   case Instruction::Trunc:
6335   case Instruction::FPTrunc:
6336   case Instruction::BitCast: {
6337     // We optimize the truncation of induction variables having constant
6338     // integer steps. The cost of these truncations is the same as the scalar
6339     // operation.
6340     if (isOptimizableIVTruncate(I, VF)) {
6341       auto *Trunc = cast<TruncInst>(I);
6342       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6343                                   Trunc->getSrcTy(), Trunc);
6344     }
6345 
6346     Type *SrcScalarTy = I->getOperand(0)->getType();
6347     Type *SrcVecTy =
6348         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6349     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or turn it
      // into a slightly different cast. For example, if MinBW == 16,
6352       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6353       //
6354       // Calculate the modified src and dest types.
6355       Type *MinVecTy = VectorTy;
6356       if (I->getOpcode() == Instruction::Trunc) {
6357         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6358         VectorTy =
6359             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6360       } else if (I->getOpcode() == Instruction::ZExt ||
6361                  I->getOpcode() == Instruction::SExt) {
6362         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6363         VectorTy =
6364             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6365       }
6366     }
6367 
6368     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6369     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6370   }
6371   case Instruction::Call: {
6372     bool NeedToScalarize;
6373     CallInst *CI = cast<CallInst>(I);
6374     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6375     if (getVectorIntrinsicIDForCall(CI, TLI))
6376       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6377     return CallCost;
6378   }
6379   default:
6380     // The cost of executing VF copies of the scalar instruction. This opcode
6381     // is unknown. Assume that it is the same as 'mul'.
6382     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6383            getScalarizationOverhead(I, VF);
6384   } // end of switch.
6385 }
6386 
6387 char LoopVectorize::ID = 0;
6388 
6389 static const char lv_name[] = "Loop Vectorization";
6390 
6391 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6392 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6393 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6394 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6395 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6396 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6397 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6398 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6399 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6400 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6401 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6402 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6403 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6404 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6405 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6406 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6407 
6408 namespace llvm {
6409 
6410 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6411 
6412 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6413                               bool VectorizeOnlyWhenForced) {
6414   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6415 }
6416 
6417 } // end namespace llvm
6418 
6419 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6420   // Check if the pointer operand of a load or store instruction is
6421   // consecutive.
6422   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6423     return Legal->isConsecutivePtr(Ptr);
6424   return false;
6425 }
6426 
6427 void LoopVectorizationCostModel::collectValuesToIgnore() {
6428   // Ignore ephemeral values.
6429   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6430 
6431   // Ignore type-promoting instructions we identified during reduction
6432   // detection.
6433   for (auto &Reduction : Legal->getReductionVars()) {
6434     RecurrenceDescriptor &RedDes = Reduction.second;
6435     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6436     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6437   }
6438   // Ignore type-casting instructions we identified during induction
6439   // detection.
6440   for (auto &Induction : Legal->getInductionVars()) {
6441     InductionDescriptor &IndDes = Induction.second;
6442     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6443     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6444   }
6445 }
6446 
6447 // TODO: we could return a pair of values that specify the max VF and
6448 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it yet because VPlan does not currently
// have a cost model that can choose which plan to execute if more than one is
// generated.
6452 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6453                                  LoopVectorizationCostModel &CM) {
6454   unsigned WidestType;
6455   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
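  // WidestType is a size in bits, so this yields the number of widest-type
  // elements that fit in one vector register.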
6456   return WidestVectorRegBits / WidestType;
6457 }
6458 
6459 VectorizationFactor
6460 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6461   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
6464   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6465   // the vectorization pipeline.
6466   if (!OrigLoop->empty()) {
6467     // If the user doesn't provide a vectorization factor, determine a
6468     // reasonable one.
6469     if (!UserVF) {
6470       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6471       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6472 
6473       // Make sure we have a VF > 1 for stress testing.
6474       if (VPlanBuildStressTest && VF < 2) {
6475         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6476                           << "overriding computed VF.\n");
6477         VF = 4;
6478       }
6479     }
6480     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6481     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6482     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6483                       << " to build VPlans.\n");
6484     buildVPlans(VF, VF);
6485 
6486     // For VPlan build stress testing, we bail out after VPlan construction.
6487     if (VPlanBuildStressTest)
6488       return VectorizationFactor::Disabled();
6489 
6490     return {VF, 0};
6491   }
6492 
6493   LLVM_DEBUG(
6494       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6495                 "VPlan-native path.\n");
6496   return VectorizationFactor::Disabled();
6497 }
6498 
6499 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6500   assert(OrigLoop->empty() && "Inner loop expected.");
6501   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6503     return None;
6504 
6505   // Invalidate interleave groups if all blocks of loop will be predicated.
6506   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6507       !useMaskedInterleavedAccesses(*TTI)) {
6508     LLVM_DEBUG(
6509         dbgs()
6510         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6511            "which requires masked-interleaved support.\n");
6512     CM.InterleaveInfo.reset();
6513   }
6514 
6515   if (UserVF) {
6516     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6517     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6518     // Collect the instructions (and their associated costs) that will be more
6519     // profitable to scalarize.
6520     CM.selectUserVectorizationFactor(UserVF);
6521     buildVPlansWithVPRecipes(UserVF, UserVF);
6522     LLVM_DEBUG(printPlans(dbgs()));
6523     return {{UserVF, 0}};
6524   }
6525 
6526   unsigned MaxVF = MaybeMaxVF.getValue();
6527   assert(MaxVF != 0 && "MaxVF is zero.");
6528 
6529   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6530     // Collect Uniform and Scalar instructions after vectorization with VF.
6531     CM.collectUniformsAndScalars(VF);
6532 
6533     // Collect the instructions (and their associated costs) that will be more
6534     // profitable to scalarize.
6535     if (VF > 1)
6536       CM.collectInstsToScalarize(VF);
6537   }
6538 
6539   buildVPlansWithVPRecipes(1, MaxVF);
6540   LLVM_DEBUG(printPlans(dbgs()));
6541   if (MaxVF == 1)
6542     return VectorizationFactor::Disabled();
6543 
6544   // Select the optimal vectorization factor.
6545   return CM.selectVectorizationFactor(MaxVF);
6546 }
6547 
6548 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6549   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6550                     << '\n');
6551   BestVF = VF;
6552   BestUF = UF;
6553 
6554   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6555     return !Plan->hasVF(VF);
6556   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6558 }
6559 
6560 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6561                                            DominatorTree *DT) {
6562   // Perform the actual loop transformation.
6563 
6564   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6565   VPCallbackILV CallbackILV(ILV);
6566 
6567   VPTransformState State{BestVF, BestUF,      LI,
6568                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6569                          &ILV,   CallbackILV};
6570   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6571   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6572 
6573   //===------------------------------------------------===//
6574   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost model.
6578   //
6579   //===------------------------------------------------===//
6580 
6581   // 2. Copy and widen instructions from the old loop into the new loop.
6582   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6583   VPlans.front()->execute(&State);
6584 
6585   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6586   //    predication, updating analyses.
6587   ILV.fixVectorizedLoop();
6588 }
6589 
6590 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6591     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6592   BasicBlock *Latch = OrigLoop->getLoopLatch();
6593 
6594   // We create new control-flow for the vectorized loop, so the original
6595   // condition will be dead after vectorization if it's only used by the
6596   // branch.
6597   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6598   if (Cmp && Cmp->hasOneUse())
6599     DeadInstructions.insert(Cmp);
6600 
6601   // We create new "steps" for induction variable updates to which the original
6602   // induction variables map. An original update instruction will be dead if
6603   // all its users except the induction variable are dead.
6604   for (auto &Induction : Legal->getInductionVars()) {
6605     PHINode *Ind = Induction.first;
6606     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6607     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6608           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6609                                  DeadInstructions.end();
6610         }))
6611       DeadInstructions.insert(IndUpdate);
6612 
    // We also record as "Dead" the type-casting instructions identified during
    // induction analysis. We don't need any handling for them in the
6615     // vectorized loop because we have proven that, under a proper runtime
6616     // test guarding the vectorized loop, the value of the phi, and the casted
6617     // value of the phi, are the same. The last instruction in this casting chain
6618     // will get its scalar/vector/widened def from the scalar/vector/widened def
6619     // of the respective phi node. Any other casts in the induction def-use chain
6620     // have no other uses outside the phi update chain, and will be ignored.
6621     InductionDescriptor &IndDes = Induction.second;
6622     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6623     DeadInstructions.insert(Casts.begin(), Casts.end());
6624   }
6625 }
6626 
6627 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6628 
6629 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6630 
6631 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6632                                         Instruction::BinaryOps BinOp) {
6633   // When unrolling and the VF is 1, we only need to add a simple scalar.
6634   Type *Ty = Val->getType();
6635   assert(!Ty->isVectorTy() && "Val must be a scalar");
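  // The result is Val op (StartIdx * Step): an fmul/BinOp sequence for
  // floating-point types, and a mul/add for integer types.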
6636 
6637   if (Ty->isFloatingPointTy()) {
6638     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6639 
6640     // Floating point operations had to be 'fast' to enable the unrolling.
6641     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6642     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6643   }
6644   Constant *C = ConstantInt::get(Ty, StartIdx);
6645   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6646 }
6647 
6648 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
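  // Annotate L with "llvm.loop.unroll.runtime.disable" metadata, unless an
  // "llvm.loop.unroll.disable" hint is already present.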
6649   SmallVector<Metadata *, 4> MDs;
6650   // Reserve first location for self reference to the LoopID metadata node.
6651   MDs.push_back(nullptr);
6652   bool IsUnrollMetadata = false;
6653   MDNode *LoopID = L->getLoopID();
6654   if (LoopID) {
6655     // First find existing loop unrolling disable metadata.
6656     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6657       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6658       if (MD) {
6659         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6660         IsUnrollMetadata =
6661             S && S->getString().startswith("llvm.loop.unroll.disable");
6662       }
6663       MDs.push_back(LoopID->getOperand(i));
6664     }
6665   }
6666 
6667   if (!IsUnrollMetadata) {
6668     // Add runtime unroll disable metadata.
6669     LLVMContext &Context = L->getHeader()->getContext();
6670     SmallVector<Metadata *, 1> DisableOperands;
6671     DisableOperands.push_back(
6672         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6673     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6674     MDs.push_back(DisableNode);
6675     MDNode *NewLoopID = MDNode::get(Context, MDs);
6676     // Set operand 0 to refer to the loop id itself.
6677     NewLoopID->replaceOperandWith(0, NewLoopID);
6678     L->setLoopID(NewLoopID);
6679   }
6680 }
6681 
6682 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6683     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6684   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
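  // The return value is the predicate's answer at Range.Start. The range is
  // clamped at the first VF for which the answer differs, so that every VF in
  // [Start, End) shares the same decision.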
6685   bool PredicateAtRangeStart = Predicate(Range.Start);
6686 
6687   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6688     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6689       Range.End = TmpVF;
6690       break;
6691     }
6692 
6693   return PredicateAtRangeStart;
6694 }
6695 
6696 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6697 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6698 /// of VF's starting at a given VF and extending it as much as possible. Each
6699 /// vectorization decision can potentially shorten this sub-range during
6700 /// buildVPlan().
6701 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6702   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6703     VFRange SubRange = {VF, MaxVF + 1};
6704     VPlans.push_back(buildVPlan(SubRange));
6705     VF = SubRange.End;
6706   }
6707 }
6708 
6709 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6710                                          VPlanPtr &Plan) {
6711   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6712 
6713   // Look for cached value.
6714   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6715   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6716   if (ECEntryIt != EdgeMaskCache.end())
6717     return ECEntryIt->second;
6718 
6719   VPValue *SrcMask = createBlockInMask(Src, Plan);
6720 
6721   // The terminator has to be a branch inst!
6722   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6723   assert(BI && "Unexpected terminator found");
6724 
6725   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6726     return EdgeMaskCache[Edge] = SrcMask;
6727 
6728   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6729   assert(EdgeMask && "No Edge Mask found for condition");
6730 
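  // The condition's mask corresponds to taking the edge to successor 0;
  // negate it if this edge leads to successor 1 instead.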
6731   if (BI->getSuccessor(0) != Dst)
6732     EdgeMask = Builder.createNot(EdgeMask);
6733 
6734   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6735     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6736 
6737   return EdgeMaskCache[Edge] = EdgeMask;
6738 }
6739 
6740 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6741   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6742 
6743   // Look for cached value.
6744   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6745   if (BCEntryIt != BlockMaskCache.end())
6746     return BCEntryIt->second;
6747 
6748   // All-one mask is modelled as no-mask following the convention for masked
6749   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6750   VPValue *BlockMask = nullptr;
6751 
6752   if (OrigLoop->getHeader() == BB) {
6753     if (!CM.blockNeedsPredication(BB))
6754       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6755 
6756     // Introduce the early-exit compare IV <= BTC to form header block mask.
6757     // This is used instead of IV < TC because TC may wrap, unlike BTC.
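    // For example, with a trip count of 8 the backedge-taken count is 7, so
    // lanes whose IV value exceeds 7 are masked off.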
6758     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6759     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6760     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6761     return BlockMaskCache[BB] = BlockMask;
6762   }
6763 
6764   // This is the block mask. We OR all incoming edges.
6765   for (auto *Predecessor : predecessors(BB)) {
6766     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6767     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6768       return BlockMaskCache[BB] = EdgeMask;
6769 
6770     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6771       BlockMask = EdgeMask;
6772       continue;
6773     }
6774 
6775     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6776   }
6777 
6778   return BlockMaskCache[BB] = BlockMask;
6779 }
6780 
6781 VPWidenMemoryInstructionRecipe *
6782 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6783                                   VPlanPtr &Plan) {
6784   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6785     return nullptr;
6786 
6787   auto willWiden = [&](unsigned VF) -> bool {
6788     if (VF == 1)
6789       return false;
6790     LoopVectorizationCostModel::InstWidening Decision =
6791         CM.getWideningDecision(I, VF);
6792     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6793            "CM decision should be taken at this point.");
6794     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6795       return true;
6796     if (CM.isScalarAfterVectorization(I, VF) ||
6797         CM.isProfitableToScalarize(I, VF))
6798       return false;
6799     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6800   };
6801 
6802   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6803     return nullptr;
6804 
6805   VPValue *Mask = nullptr;
6806   if (Legal->isMaskRequired(I))
6807     Mask = createBlockInMask(I->getParent(), Plan);
6808 
6809   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6810   return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
6811 }
6812 
6813 VPWidenIntOrFpInductionRecipe *
6814 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6815   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6816     // Check if this is an integer or fp induction. If so, build the recipe that
6817     // produces its scalar and vector values.
6818     InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6819     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6820         II.getKind() == InductionDescriptor::IK_FpInduction)
6821       return new VPWidenIntOrFpInductionRecipe(Phi);
6822 
6823     return nullptr;
6824   }
6825 
6826   // Optimize the special case where the source is a constant integer
6827   // induction variable. Notice that we can only optimize the 'trunc' case
6828   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6829   // (c) other casts depend on pointer size.
6830 
6831   // Determine whether \p K is a truncation based on an induction variable that
6832   // can be optimized.
6833   auto isOptimizableIVTruncate =
6834       [&](Instruction *K) -> std::function<bool(unsigned)> {
6835     return
6836         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6837   };
6838 
6839   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6840                                isOptimizableIVTruncate(I), Range))
6841     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6842                                              cast<TruncInst>(I));
6843   return nullptr;
6844 }
6845 
6846 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6847   PHINode *Phi = dyn_cast<PHINode>(I);
6848   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6849     return nullptr;
6850 
6851   // We know that all PHIs in non-header blocks are converted into selects, so
6852   // we don't have to worry about the insertion order and we can just use the
6853   // builder. At this point we generate the predication tree. There may be
6854   // duplications since this is a simple recursive scan, but future
6855   // optimizations will clean it up.
6856 
6857   SmallVector<VPValue *, 2> Masks;
6858   unsigned NumIncoming = Phi->getNumIncomingValues();
6859   for (unsigned In = 0; In < NumIncoming; In++) {
6860     VPValue *EdgeMask =
6861       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6862     assert((EdgeMask || NumIncoming == 1) &&
6863            "Multiple predecessors with one having a full mask");
6864     if (EdgeMask)
6865       Masks.push_back(EdgeMask);
6866   }
6867   return new VPBlendRecipe(Phi, Masks);
6868 }
6869 
6870 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6871                                  VFRange &Range) {
6872 
6873   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6874       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6875 
6876   if (IsPredicated)
6877     return false;
6878 
6879   auto IsVectorizableOpcode = [](unsigned Opcode) {
6880     switch (Opcode) {
6881     case Instruction::Add:
6882     case Instruction::And:
6883     case Instruction::AShr:
6884     case Instruction::BitCast:
6885     case Instruction::Br:
6886     case Instruction::Call:
6887     case Instruction::FAdd:
6888     case Instruction::FCmp:
6889     case Instruction::FDiv:
6890     case Instruction::FMul:
6891     case Instruction::FNeg:
6892     case Instruction::FPExt:
6893     case Instruction::FPToSI:
6894     case Instruction::FPToUI:
6895     case Instruction::FPTrunc:
6896     case Instruction::FRem:
6897     case Instruction::FSub:
6898     case Instruction::ICmp:
6899     case Instruction::IntToPtr:
6900     case Instruction::Load:
6901     case Instruction::LShr:
6902     case Instruction::Mul:
6903     case Instruction::Or:
6904     case Instruction::PHI:
6905     case Instruction::PtrToInt:
6906     case Instruction::SDiv:
6907     case Instruction::Select:
6908     case Instruction::SExt:
6909     case Instruction::Shl:
6910     case Instruction::SIToFP:
6911     case Instruction::SRem:
6912     case Instruction::Store:
6913     case Instruction::Sub:
6914     case Instruction::Trunc:
6915     case Instruction::UDiv:
6916     case Instruction::UIToFP:
6917     case Instruction::URem:
6918     case Instruction::Xor:
6919     case Instruction::ZExt:
6920       return true;
6921     }
6922     return false;
6923   };
6924 
6925   if (!IsVectorizableOpcode(I->getOpcode()))
6926     return false;
6927 
6928   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6929     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6930     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6931                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6932       return false;
6933   }
6934 
6935   auto willWiden = [&](unsigned VF) -> bool {
6936     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6937                              CM.isProfitableToScalarize(I, VF)))
6938       return false;
6939     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6940       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag indicates whether we use an intrinsic or a regular call for
      // the vectorized version of the instruction, i.e. whether an intrinsic
      // call is more beneficial than a library call.
6945       bool NeedToScalarize;
6946       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6947       bool UseVectorIntrinsic =
6948           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6949       return UseVectorIntrinsic || !NeedToScalarize;
6950     }
6951     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6952       assert(CM.getWideningDecision(I, VF) ==
6953                  LoopVectorizationCostModel::CM_Scalarize &&
6954              "Memory widening decisions should have been taken care by now");
6955       return false;
6956     }
6957     return true;
6958   };
6959 
6960   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6961     return false;
6962   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6963   // to avoid having to split recipes later.
6964   bool IsSingleton = Ingredient2Recipe.count(I);
6965 
6966   // Success: widen this instruction.
6967 
6968   // Use the default widening recipe. We optimize the common case where
6969   // consecutive instructions can be represented by a single recipe.
6970   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6971       LastExtensibleRecipe->appendInstruction(I))
6972     return true;
6973 
6974   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6975   if (!IsSingleton)
6976     LastExtensibleRecipe = WidenRecipe;
6977   setRecipe(I, WidenRecipe);
6978   VPBB->appendRecipe(WidenRecipe);
6979   return true;
6980 }
6981 
6982 VPBasicBlock *VPRecipeBuilder::handleReplication(
6983     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6984     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6985     VPlanPtr &Plan) {
6986   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6987       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6988       Range);
6989 
6990   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6991       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6992 
6993   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6994   setRecipe(I, Recipe);
6995 
  // Check if I uses a predicated instruction. If so, I will use that
  // instruction's scalar value, so avoid hoisting the insert-element which
  // packs the scalar value into a vector value; that packing happens iff all
  // users use the vector value.
6999   for (auto &Op : I->operands())
7000     if (auto *PredInst = dyn_cast<Instruction>(Op))
7001       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7002         PredInst2Recipe[PredInst]->setAlsoPack(false);
7003 
7004   // Finalize the recipe for Instr, first if it is not predicated.
7005   if (!IsPredicated) {
7006     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7007     VPBB->appendRecipe(Recipe);
7008     return VPBB;
7009   }
7010   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7011   assert(VPBB->getSuccessors().empty() &&
7012          "VPBB has successors when handling predicated replication.");
7013   // Record predicated instructions for above packing optimizations.
7014   PredInst2Recipe[I] = Recipe;
7015   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7016   VPBlockUtils::insertBlockAfter(Region, VPBB);
7017   auto *RegSucc = new VPBasicBlock();
7018   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7019   return RegSucc;
7020 }
7021 
7022 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7023                                                       VPRecipeBase *PredRecipe,
7024                                                       VPlanPtr &Plan) {
7025   // Instructions marked for predication are replicated and placed under an
7026   // if-then construct to prevent side-effects.
7027 
7028   // Generate recipes to compute the block mask for this region.
7029   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7030 
7031   // Build the triangular if-then region.
7032   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7033   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7034   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7035   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7036   auto *PHIRecipe =
7037       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7038   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7039   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7040   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7041 
7042   // Note: first set Entry as region entry and then connect successors starting
7043   // from it in order, to propagate the "parent" of each VPBasicBlock.
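  // Entry branches on BlockInMask either into Pred or directly to Exit; Pred
  // then falls through to Exit.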
7044   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7045   VPBlockUtils::connectBlocks(Pred, Exit);
7046 
7047   return Region;
7048 }
7049 
7050 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7051                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7052   VPRecipeBase *Recipe = nullptr;
7053 
7054   // First, check for specific widening recipes that deal with memory
7055   // operations, inductions and Phi nodes.
7056   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7057       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7058       (Recipe = tryToBlend(Instr, Plan)) ||
7059       (isa<PHINode>(Instr) &&
7060        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7061     setRecipe(Instr, Recipe);
7062     VPBB->appendRecipe(Recipe);
7063     return true;
7064   }
7065 
7066   // Handle GEP widening.
7067   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7068     auto Scalarize = [&](unsigned VF) {
7069       return CM.isScalarWithPredication(Instr, VF) ||
7070              CM.isScalarAfterVectorization(Instr, VF) ||
7071              CM.isProfitableToScalarize(Instr, VF);
7072     };
7073     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7074       return false;
7075     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7076     setRecipe(Instr, Recipe);
7077     VPBB->appendRecipe(Recipe);
7078     return true;
7079   }
7080 
7081   // Check if Instr is to be widened by a general VPWidenRecipe, after
7082   // having first checked for specific widening recipes.
7083   if (tryToWiden(Instr, VPBB, Range))
7084     return true;
7085 
7086   return false;
7087 }
7088 
7089 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7090                                                         unsigned MaxVF) {
7091   assert(OrigLoop->empty() && "Inner loop expected.");
7092 
7093   // Collect conditions feeding internal conditional branches; they need to be
7094   // represented in VPlan for it to model masking.
7095   SmallPtrSet<Value *, 1> NeedDef;
7096 
7097   auto *Latch = OrigLoop->getLoopLatch();
7098   for (BasicBlock *BB : OrigLoop->blocks()) {
7099     if (BB == Latch)
7100       continue;
7101     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7102     if (Branch && Branch->isConditional())
7103       NeedDef.insert(Branch->getCondition());
7104   }
7105 
7106   // If the tail is to be folded by masking, the primary induction variable
7107   // needs to be represented in VPlan for it to model early-exit masking.
7108   // Also, both the Phi and the live-out instruction of each reduction are
7109   // required in order to introduce a select between them in VPlan.
7110   if (CM.foldTailByMasking()) {
7111     NeedDef.insert(Legal->getPrimaryInduction());
7112     for (auto &Reduction : Legal->getReductionVars()) {
7113       NeedDef.insert(Reduction.first);
7114       NeedDef.insert(Reduction.second.getLoopExitInstr());
7115     }
7116   }
7117 
7118   // Collect instructions from the original loop that will become trivially dead
7119   // in the vectorized loop. We don't need to vectorize these instructions. For
7120   // example, original induction update instructions can become dead because we
7121   // separately emit induction "steps" when generating code for the new loop.
7122   // Similarly, we create a new latch condition when setting up the structure
7123   // of the new loop, so the old one can become dead.
7124   SmallPtrSet<Instruction *, 4> DeadInstructions;
7125   collectTriviallyDeadInstructions(DeadInstructions);
7126 
7127   // Add assume instructions we need to drop to DeadInstructions, to prevent
7128   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7130   // control flow is preserved, we should keep them.
7131   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7132   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7133 
7134   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7135   // Dead instructions do not need sinking. Remove them from SinkAfter.
7136   for (Instruction *I : DeadInstructions)
7137     SinkAfter.erase(I);
7138 
7139   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7140     VFRange SubRange = {VF, MaxVF + 1};
7141     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7142                                              DeadInstructions, SinkAfter));
7143     VF = SubRange.End;
7144   }
7145 }
7146 
7147 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7148     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7149     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7150     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7151 
7152   // Hold a mapping from predicated instructions to their recipes, in order to
7153   // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7155   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7156 
7157   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7158 
7159   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7160 
7161   // ---------------------------------------------------------------------------
7162   // Pre-construction: record ingredients whose recipes we'll need to further
7163   // process after constructing the initial VPlan.
7164   // ---------------------------------------------------------------------------
7165 
7166   // Mark instructions we'll need to sink later and their targets as
7167   // ingredients whose recipe we'll need to record.
7168   for (auto &Entry : SinkAfter) {
7169     RecipeBuilder.recordRecipeOf(Entry.first);
7170     RecipeBuilder.recordRecipeOf(Entry.second);
7171   }
7172 
7173   // For each interleave group which is relevant for this (possibly trimmed)
7174   // Range, add it to the set of groups to be later applied to the VPlan and add
7175   // placeholders for its members' Recipes which we'll be replacing with a
7176   // single VPInterleaveRecipe.
7177   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7178     auto applyIG = [IG, this](unsigned VF) -> bool {
7179       return (VF >= 2 && // Query is illegal for VF == 1
7180               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7181                   LoopVectorizationCostModel::CM_Interleave);
7182     };
7183     if (!getDecisionAndClampRange(applyIG, Range))
7184       continue;
7185     InterleaveGroups.insert(IG);
7186     for (unsigned i = 0; i < IG->getFactor(); i++)
7187       if (Instruction *Member = IG->getMember(i))
7188         RecipeBuilder.recordRecipeOf(Member);
7189   };
7190 
7191   // ---------------------------------------------------------------------------
7192   // Build initial VPlan: Scan the body of the loop in a topological order to
7193   // visit each basic block after having visited its predecessor basic blocks.
7194   // ---------------------------------------------------------------------------
7195 
7196   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7197   auto Plan = std::make_unique<VPlan>();
7198   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7199   Plan->setEntry(VPBB);
7200 
7201   // Represent values that will have defs inside VPlan.
7202   for (Value *V : NeedDef)
7203     Plan->addVPValue(V);
7204 
7205   // Scan the body of the loop in a topological order to visit each basic block
7206   // after having visited its predecessor basic blocks.
7207   LoopBlocksDFS DFS(OrigLoop);
7208   DFS.perform(LI);
7209 
7210   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and will fill a new VPBasicBlock.
7213     unsigned VPBBsForBB = 0;
7214     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7215     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7216     VPBB = FirstVPBBForBB;
7217     Builder.setInsertPoint(VPBB);
7218 
7219     // Introduce each ingredient into VPlan.
7220     for (Instruction &I : BB->instructionsWithoutDebug()) {
7221       Instruction *Instr = &I;
7222 
7223       // First filter out irrelevant instructions, to ensure no recipes are
7224       // built for them.
7225       if (isa<BranchInst>(Instr) ||
7226           DeadInstructions.find(Instr) != DeadInstructions.end())
7227         continue;
7228 
7229       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7230         continue;
7231 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7234       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7235           Instr, Range, VPBB, PredInst2Recipe, Plan);
7236       if (NextVPBB != VPBB) {
7237         VPBB = NextVPBB;
7238         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7239                                     : "");
7240       }
7241     }
7242   }
7243 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7247   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7248   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7249   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7250   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7251   delete PreEntry;
7252 
7253   // ---------------------------------------------------------------------------
7254   // Transform initial VPlan: Apply previously taken decisions, in order, to
7255   // bring the VPlan to its final state.
7256   // ---------------------------------------------------------------------------
7257 
7258   // Apply Sink-After legal constraints.
7259   for (auto &Entry : SinkAfter) {
7260     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7261     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7262     Sink->moveAfter(Target);
7263   }
7264 
7265   // Interleave memory: for each Interleave Group we marked earlier as relevant
7266   // for this VPlan, replace the Recipes widening its memory instructions with a
7267   // single VPInterleaveRecipe at its insertion point.
7268   for (auto IG : InterleaveGroups) {
7269     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7270         RecipeBuilder.getRecipe(IG->getInsertPos()));
7271     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7272         ->insertBefore(Recipe);
7273 
7274     for (unsigned i = 0; i < IG->getFactor(); ++i)
7275       if (Instruction *Member = IG->getMember(i)) {
7276         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7277       }
7278   }
7279 
7280   // Finally, if tail is folded by masking, introduce selects between the phi
7281   // and the live-out instruction of each reduction, at the end of the latch.
7282   if (CM.foldTailByMasking()) {
7283     Builder.setInsertPoint(VPBB);
7284     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7285     for (auto &Reduction : Legal->getReductionVars()) {
7286       VPValue *Phi = Plan->getVPValue(Reduction.first);
7287       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7288       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7289     }
7290   }
7291 
7292   std::string PlanName;
7293   raw_string_ostream RSO(PlanName);
7294   unsigned VF = Range.Start;
7295   Plan->addVF(VF);
7296   RSO << "Initial VPlan for VF={" << VF;
7297   for (VF *= 2; VF < Range.End; VF *= 2) {
7298     Plan->addVF(VF);
7299     RSO << "," << VF;
7300   }
7301   RSO << "},UF>=1";
7302   RSO.flush();
7303   Plan->setName(PlanName);
7304 
7305   return Plan;
7306 }
7307 
7308 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
7311   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7312   // the vectorization pipeline.
7313   assert(!OrigLoop->empty());
7314   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7315 
7316   // Create new empty VPlan
7317   auto Plan = std::make_unique<VPlan>();
7318 
7319   // Build hierarchical CFG
7320   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7321   HCFGBuilder.buildHierarchicalCFG();
7322 
7323   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7324     Plan->addVF(VF);
7325 
7326   if (EnableVPlanPredication) {
7327     VPlanPredicator VPP(*Plan);
7328     VPP.predicate();
7329 
7330     // Avoid running transformation to recipes until masked code generation in
7331     // VPlan-native path is in place.
7332     return Plan;
7333   }
7334 
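  // Replace the VPInstructions in the plan with corresponding recipes. No
  // instructions have been marked dead on this path, so pass an empty set.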
7335   SmallPtrSet<Instruction *, 1> DeadInstructions;
7336   VPlanTransforms::VPInstructionsToVPRecipes(
7337       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7338   return Plan;
7339 }
7340 
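// VPCallbackILV: callbacks used during VPlan execution to obtain (creating
// them on demand if needed) the vector and scalar values of an IR Value; both
// simply delegate to the InnerLoopVectorizer.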
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7345 
7346 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7347     Value *V, const VPIteration &Instance) {
7348   return ILV.getOrCreateScalarValue(V, Instance);
7349 }
7350 
7351 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7352                                VPSlotTracker &SlotTracker) const {
7353   O << " +\n"
7354     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7355   IG->getInsertPos()->printAsOperand(O, false);
7356   O << ", ";
7357   getAddr()->printAsOperand(O, SlotTracker);
7358   VPValue *Mask = getMask();
7359   if (Mask) {
7360     O << ", ";
7361     Mask->printAsOperand(O, SlotTracker);
7362   }
7363   O << "\\l\"";
7364   for (unsigned i = 0; i < IG->getFactor(); ++i)
7365     if (Instruction *I = IG->getMember(i))
7366       O << " +\n"
7367         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7368 }
7369 
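// Widen each ingredient instruction covered by this recipe.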
7370 void VPWidenRecipe::execute(VPTransformState &State) {
7371   for (auto &Instr : make_range(Begin, End))
7372     State.ILV->widenInstruction(Instr);
7373 }
7374 
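// Widen the GEP ingredient, taking into account which of its pointer operand
// and indices are loop-invariant.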
7375 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7376   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7377                       IsIndexLoopInvariant);
7378 }
7379 
7380 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7381   assert(!State.Instance && "Int or FP induction being replicated.");
7382   State.ILV->widenIntOrFpInduction(IV, Trunc);
7383 }
7384 
7385 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7386   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7387 }
7388 
7389 void VPBlendRecipe::execute(VPTransformState &State) {
7390   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7391   // We know that all PHIs in non-header blocks are converted into
7392   // selects, so we don't have to worry about the insertion order and we
7393   // can just use the builder.
7394   // At this point we generate the predication tree. There may be
7395   // duplications since this is a simple recursive scan, but future
7396   // optimizations will clean it up.
7397 
7398   unsigned NumIncoming = Phi->getNumIncomingValues();
7399 
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7402   // Generate a sequence of selects of the form:
7403   // SELECT(Mask3, In3,
7404   //      SELECT(Mask2, In2,
7405   //                   ( ...)))
7406   InnerLoopVectorizer::VectorParts Entry(State.UF);
7407   for (unsigned In = 0; In < NumIncoming; ++In) {
7408     for (unsigned Part = 0; Part < State.UF; ++Part) {
7409       // We might have single edge PHIs (blocks) - use an identity
7410       // 'select' for the first PHI operand.
7411       Value *In0 =
7412           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7413       if (In == 0)
7414         Entry[Part] = In0; // Initialize with the first incoming value.
7415       else {
7416         // Select between the current value and the previous incoming edge
7417         // based on the incoming mask.
7418         Value *Cond = State.get(User->getOperand(In), Part);
7419         Entry[Part] =
7420             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7421       }
7422     }
7423   }
7424   for (unsigned Part = 0; Part < State.UF; ++Part)
7425     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7426 }
7427 
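// Generate code for the entire interleave group, anchored at the group's
// insert position, using the recipe's address and optional mask operands.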
7428 void VPInterleaveRecipe::execute(VPTransformState &State) {
7429   assert(!State.Instance && "Interleave group being replicated.");
7430   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7431                                       getMask());
7432 }
7433 
7434 void VPReplicateRecipe::execute(VPTransformState &State) {
7435   if (State.Instance) { // Generate a single instance.
7436     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert the scalar instance, packing it into a vector.
7438     if (AlsoPack && State.VF > 1) {
7439       // If we're constructing lane 0, initialize to start from undef.
7440       if (State.Instance->Lane == 0) {
7441         Value *Undef =
7442             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7443         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7444       }
7445       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7446     }
7447     return;
7448   }
7449 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane of each
  // UF part.
7453   unsigned EndLane = IsUniform ? 1 : State.VF;
7454   for (unsigned Part = 0; Part < State.UF; ++Part)
7455     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7456       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7457 }
7458 
7459 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on a single instance.");
7461 
7462   unsigned Part = State.Instance->Part;
7463   unsigned Lane = State.Instance->Lane;
7464 
7465   Value *ConditionBit = nullptr;
7466   if (!User) // Block in mask is all-one.
7467     ConditionBit = State.Builder.getTrue();
7468   else {
7469     VPValue *BlockInMask = User->getOperand(0);
7470     ConditionBit = State.get(BlockInMask, Part);
7471     if (ConditionBit->getType()->isVectorTy())
7472       ConditionBit = State.Builder.CreateExtractElement(
7473           ConditionBit, State.Builder.getInt32(Lane));
7474   }
7475 
7476   // Replace the temporary unreachable terminator with a new conditional branch,
7477   // whose two destinations will be set later when they are created.
7478   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7479   assert(isa<UnreachableInst>(CurrentTerminator) &&
7480          "Expected to replace unreachable terminator with conditional branch.");
7481   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7482   CondBr->setSuccessor(0, nullptr);
7483   ReplaceInstWithInst(CurrentTerminator, CondBr);
7484 }
7485 
7486 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7487   assert(State.Instance && "Predicated instruction PHI works per instance.");
7488   Instruction *ScalarPredInst = cast<Instruction>(
7489       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7490   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7491   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7492   assert(PredicatingBB && "Predicated block has no single predecessor.");
7493 
7494   // By current pack/unpack logic we need to generate only a single phi node: if
7495   // a vector value for the predicated instruction exists at this point it means
7496   // the instruction has vector users only, and a phi for the vector value is
7497   // needed. In this case the recipe of the predicated instruction is marked to
7498   // also do that packing, thereby "hoisting" the insert-element sequence.
7499   // Otherwise, a phi node for the scalar value is needed.
7500   unsigned Part = State.Instance->Part;
7501   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7502     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7503     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7504     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7505     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7506     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7507     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7508   } else {
7509     Type *PredInstType = PredInst->getType();
7510     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7511     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7512     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7513     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7514   }
7515 }
7516 
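// Widen the load or store ingredient, using the address and optional mask
// provided as operands of this recipe.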
7517 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7518   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
7519 }
7520 
// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code size, 2) compiler options for preferring predication,
// 3) loop hints forcing predication, and 4) a TTI hook that analyzes whether
// the loop is suitable for predication.
7525 static ScalarEpilogueLowering getScalarEpilogueLowering(
7526     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7527     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7528     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7529     LoopVectorizationLegality &LVL) {
7530   bool OptSize =
7531       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7532                                                      PGSOQueryType::IRPass);
7533   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7534   // don't look at hints or options, and don't request a scalar epilogue.
7535   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7536     return CM_ScalarEpilogueNotAllowedOptSize;
7537 
7538   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7539                               !PreferPredicateOverEpilog;
7540 
7541   // 2) Next, if disabling predication is requested on the command line, honour
7542   // this and request a scalar epilogue. Also do this if we don't have a
7543   // primary induction variable, which is required for predication.
7544   if (PredicateOptDisabled || !LVL.getPrimaryInduction())
7545     return CM_ScalarEpilogueAllowed;
7546 
  // 3) and 4) check whether predication is requested on the command line or
  // with a loop hint, or whether the TTI hook indicates it is profitable; if
  // so, request predication.
7550   if (PreferPredicateOverEpilog ||
7551       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7552       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7553                                         LVL.getLAI()) &&
7554        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7555     return CM_ScalarEpilogueNotNeededUsePredicate;
7556 
7557   return CM_ScalarEpilogueAllowed;
7558 }
7559 
7560 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
7562 // VPlan-to-VPlan transformations from the very beginning without modifying the
7563 // input LLVM IR.
7564 static bool processLoopInVPlanNativePath(
7565     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7566     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7567     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7568     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7569     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7570 
7571   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7572   Function *F = L->getHeader()->getParent();
7573   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7574 
7575   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7576       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7577 
7578   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7579                                 &Hints, IAI);
7580   // Use the planner for outer loop vectorization.
7581   // TODO: CM is not used at this point inside the planner. Turn CM into an
7582   // optional argument if we don't need it in the future.
7583   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7584 
7585   // Get user vectorization factor.
7586   const unsigned UserVF = Hints.getWidth();
7587 
7588   // Plan how to best vectorize, return the best VF and its cost.
7589   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7590 
7591   // If we are stress testing VPlan builds, do not attempt to generate vector
7592   // code. Masked vector code generation support will follow soon.
7593   // Also, do not attempt to vectorize if no vector code will be produced.
7594   if (VPlanBuildStressTest || EnableVPlanPredication ||
7595       VectorizationFactor::Disabled() == VF)
7596     return false;
7597 
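  // Select the VPlan for the chosen VF; the VPlan-native path currently
  // always uses an interleave count of 1.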
7598   LVP.setBestPlan(VF.Width, 1);
7599 
7600   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7601                          &CM);
7602   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7603                     << L->getHeader()->getParent()->getName() << "\"\n");
7604   LVP.executePlan(LB, DT);
7605 
7606   // Mark the loop as already vectorized to avoid vectorizing again.
7607   Hints.setAlreadyVectorized();
7608 
7609   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7610   return true;
7611 }
7612 
7613 bool LoopVectorizePass::processLoop(Loop *L) {
7614   assert((EnableVPlanNativePath || L->empty()) &&
7615          "VPlan-native path is not enabled. Only process inner loops.");
7616 
7617 #ifndef NDEBUG
7618   const std::string DebugLocStr = getDebugLocString(L);
7619 #endif /* NDEBUG */
7620 
7621   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7622                     << L->getHeader()->getParent()->getName() << "\" from "
7623                     << DebugLocStr << "\n");
7624 
7625   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7626 
7627   LLVM_DEBUG(
7628       dbgs() << "LV: Loop hints:"
7629              << " force="
7630              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7631                      ? "disabled"
7632                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7633                             ? "enabled"
7634                             : "?"))
7635              << " width=" << Hints.getWidth()
7636              << " unroll=" << Hints.getInterleave() << "\n");
7637 
7638   // Function containing loop
7639   Function *F = L->getHeader()->getParent();
7640 
  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose and report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
7648 
7649   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7650     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7651     return false;
7652   }
7653 
7654   PredicatedScalarEvolution PSE(*SE, *L);
7655 
7656   // Check if it is legal to vectorize the loop.
7657   LoopVectorizationRequirements Requirements(*ORE);
7658   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7659                                 &Requirements, &Hints, DB, AC);
7660   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7661     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7662     Hints.emitRemarkWithHints();
7663     return false;
7664   }
7665 
7666   // Check the function attributes and profiles to find out if this function
7667   // should be optimized for size.
7668   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7669       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7670 
7671   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7672   // here. They may require CFG and instruction level transformations before
7673   // even evaluating whether vectorization is profitable. Since we cannot modify
7674   // the incoming IR, we need to build VPlan upfront in the vectorization
7675   // pipeline.
7676   if (!L->empty())
7677     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7678                                         ORE, BFI, PSI, Hints);
7679 
7680   assert(L->empty() && "Inner loop expected.");
7681 
7682   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7683   // count by optimizing for size, to minimize overheads.
7684   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7685   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7686     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7687                       << "This loop is worth vectorizing only if no scalar "
7688                       << "iteration overheads are incurred.");
7689     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7690       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7691     else {
7692       LLVM_DEBUG(dbgs() << "\n");
7693       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7694     }
7695   }
7696 
7697   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check does not seem right -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
7701   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7702     reportVectorizationFailure(
7703         "Can't vectorize when the NoImplicitFloat attribute is used",
7704         "loop not vectorized due to NoImplicitFloat attribute",
7705         "NoImplicitFloat", ORE, L);
7706     Hints.emitRemarkWithHints();
7707     return false;
7708   }
7709 
7710   // Check if the target supports potentially unsafe FP vectorization.
7711   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7712   // for the target we're vectorizing for, to make sure none of the
7713   // additional fp-math flags can help.
7714   if (Hints.isPotentiallyUnsafe() &&
7715       TTI->isFPVectorizationPotentiallyUnsafe()) {
7716     reportVectorizationFailure(
7717         "Potentially unsafe FP op prevents vectorization",
7718         "loop not vectorized due to unsafe FP support.",
7719         "UnsafeFP", ORE, L);
7720     Hints.emitRemarkWithHints();
7721     return false;
7722   }
7723 
7724   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7725   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7726 
7727   // If an override option has been passed in for interleaved accesses, use it.
7728   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7729     UseInterleaved = EnableInterleavedMemAccesses;
7730 
7731   // Analyze interleaved memory accesses.
7732   if (UseInterleaved) {
7733     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7734   }
7735 
7736   // Use the cost model.
7737   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7738                                 F, &Hints, IAI);
7739   CM.collectValuesToIgnore();
7740 
7741   // Use the planner for vectorization.
7742   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7743 
7744   // Get user vectorization factor.
7745   unsigned UserVF = Hints.getWidth();
7746 
7747   // Plan how to best vectorize, return the best VF and its cost.
7748   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7749 
7750   VectorizationFactor VF = VectorizationFactor::Disabled();
7751   unsigned IC = 1;
7752   unsigned UserIC = Hints.getInterleave();
7753 
7754   if (MaybeVF) {
7755     VF = *MaybeVF;
7756     // Select the interleave count.
7757     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7758   }
7759 
7760   // Identify the diagnostic messages that should be produced.
7761   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7762   bool VectorizeLoop = true, InterleaveLoop = true;
7763   if (Requirements.doesNotMeet(F, L, Hints)) {
7764     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7765                          "requirements.\n");
7766     Hints.emitRemarkWithHints();
7767     return false;
7768   }
7769 
7770   if (VF.Width == 1) {
7771     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7772     VecDiagMsg = std::make_pair(
7773         "VectorizationNotBeneficial",
7774         "the cost-model indicates that vectorization is not beneficial");
7775     VectorizeLoop = false;
7776   }
7777 
7778   if (!MaybeVF && UserIC > 1) {
7779     // Tell the user interleaving was avoided up-front, despite being explicitly
7780     // requested.
7781     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7782                          "interleaving should be avoided up front\n");
7783     IntDiagMsg = std::make_pair(
7784         "InterleavingAvoided",
7785         "Ignoring UserIC, because interleaving was avoided up front");
7786     InterleaveLoop = false;
7787   } else if (IC == 1 && UserIC <= 1) {
7788     // Tell the user interleaving is not beneficial.
7789     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7790     IntDiagMsg = std::make_pair(
7791         "InterleavingNotBeneficial",
7792         "the cost-model indicates that interleaving is not beneficial");
7793     InterleaveLoop = false;
7794     if (UserIC == 1) {
7795       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7796       IntDiagMsg.second +=
7797           " and is explicitly disabled or interleave count is set to 1";
7798     }
7799   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
7801     LLVM_DEBUG(
7802         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7803     IntDiagMsg = std::make_pair(
7804         "InterleavingBeneficialButDisabled",
7805         "the cost-model indicates that interleaving is beneficial "
7806         "but is explicitly disabled or interleave count is set to 1");
7807     InterleaveLoop = false;
7808   }
7809 
7810   // Override IC if user provided an interleave count.
7811   IC = UserIC > 0 ? UserIC : IC;
7812 
7813   // Emit diagnostic messages, if any.
7814   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7815   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7817     ORE->emit([&]() {
7818       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7819                                       L->getStartLoc(), L->getHeader())
7820              << VecDiagMsg.second;
7821     });
7822     ORE->emit([&]() {
7823       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7824                                       L->getStartLoc(), L->getHeader())
7825              << IntDiagMsg.second;
7826     });
7827     return false;
7828   } else if (!VectorizeLoop && InterleaveLoop) {
7829     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7830     ORE->emit([&]() {
7831       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7832                                         L->getStartLoc(), L->getHeader())
7833              << VecDiagMsg.second;
7834     });
7835   } else if (VectorizeLoop && !InterleaveLoop) {
7836     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7837                       << ") in " << DebugLocStr << '\n');
7838     ORE->emit([&]() {
7839       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7840                                         L->getStartLoc(), L->getHeader())
7841              << IntDiagMsg.second;
7842     });
7843   } else if (VectorizeLoop && InterleaveLoop) {
7844     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7845                       << ") in " << DebugLocStr << '\n');
7846     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7847   }
7848 
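  // Select the VPlan to execute for the chosen VF and interleave count.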
7849   LVP.setBestPlan(VF.Width, IC);
7850 
7851   using namespace ore;
7852   bool DisableRuntimeUnroll = false;
7853   MDNode *OrigLoopID = L->getLoopID();
7854 
7855   if (!VectorizeLoop) {
7856     assert(IC > 1 && "interleave count should not be 1 or 0");
    // We decided not to vectorize the loop (the cost model chose VF = 1), so
    // interleave it instead.
7859     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7860                                &CM);
7861     LVP.executePlan(Unroller, DT);
7862 
7863     ORE->emit([&]() {
7864       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7865                                 L->getHeader())
7866              << "interleaved loop (interleaved count: "
7867              << NV("InterleaveCount", IC) << ")";
7868     });
7869   } else {
    // We decided it is legal and worthwhile to vectorize the loop, so do it.
7871     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7872                            &LVL, &CM);
7873     LVP.executePlan(LB, DT);
7874     ++LoopsVectorized;
7875 
    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
7879     if (!LB.areSafetyChecksAdded())
7880       DisableRuntimeUnroll = true;
7881 
7882     // Report the vectorization decision.
7883     ORE->emit([&]() {
7884       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7885                                 L->getHeader())
7886              << "vectorized loop (vectorization width: "
7887              << NV("VectorizationFactor", VF.Width)
7888              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7889     });
7890   }
7891 
7892   Optional<MDNode *> RemainderLoopID =
7893       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7894                                       LLVMLoopVectorizeFollowupEpilogue});
7895   if (RemainderLoopID.hasValue()) {
7896     L->setLoopID(RemainderLoopID.getValue());
7897   } else {
7898     if (DisableRuntimeUnroll)
7899       AddRuntimeUnrollDisableMetaData(L);
7900 
7901     // Mark the loop as already vectorized to avoid vectorizing again.
7902     Hints.setAlreadyVectorized();
7903   }
7904 
7905   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7906   return true;
7907 }
7908 
7909 bool LoopVectorizePass::runImpl(
7910     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7911     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7912     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7913     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7914     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7915   SE = &SE_;
7916   LI = &LI_;
7917   TTI = &TTI_;
7918   DT = &DT_;
7919   BFI = &BFI_;
7920   TLI = TLI_;
7921   AA = &AA_;
7922   AC = &AC_;
7923   GetLAA = &GetLAA_;
7924   DB = &DB_;
7925   ORE = &ORE_;
7926   PSI = PSI_;
7927 
7928   // Don't attempt if
7929   // 1. the target claims to have no vector registers, and
7930   // 2. interleaving won't help ILP.
7931   //
7932   // The second condition is necessary because, even if the target has no
7933   // vector registers, loop vectorization may still enable scalar
7934   // interleaving.
7935   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7936       TTI->getMaxInterleaveFactor(1) < 2)
7937     return false;
7938 
7939   bool Changed = false;
7940 
7941   // The vectorizer requires loops to be in simplified form.
7942   // Since simplification may add new inner loops, it has to run before the
7943   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7945   // vectorized.
7946   for (auto &L : *LI)
7947     Changed |=
7948         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7949 
7950   // Build up a worklist of inner-loops to vectorize. This is necessary as
7951   // the act of vectorizing or partially unrolling a loop creates new loops
7952   // and can invalidate iterators across the loops.
7953   SmallVector<Loop *, 8> Worklist;
7954 
7955   for (Loop *L : *LI)
7956     collectSupportedLoops(*L, LI, ORE, Worklist);
7957 
7958   LoopsAnalyzed += Worklist.size();
7959 
7960   // Now walk the identified inner loops.
7961   while (!Worklist.empty()) {
7962     Loop *L = Worklist.pop_back_val();
7963 
7964     // For the inner loops we actually process, form LCSSA to simplify the
7965     // transform.
7966     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7967 
7968     Changed |= processLoop(L);
7969   }
7970 
7971   // Process each loop nest in the function.
7972   return Changed;
7973 }
7974 
7975 PreservedAnalyses LoopVectorizePass::run(Function &F,
7976                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
8018 }
8019