1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
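// For example (an illustrative sketch, assuming VF = 4 and ignoring the
// epilogue), a scalar loop of the form
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
// is conceptually rewritten so that each iteration operates on <4 x i32>
// values and the induction variable advances by four:
//   for (i = 0; i < n; i += 4)
//     A[i:i+3] = B[i:i+3] + <42, 42, 42, 42>;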
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
180 // Indicates that an epilogue is undesired and that predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185     cl::desc("Indicate that an epilogue is undesired, predication should be "
186              "used instead."));
187 
188 static cl::opt<bool> MaximizeBandwidth(
189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190     cl::desc("Maximize bandwidth when selecting vectorization factor which "
191              "will be determined by the smallest type in loop."));
192 
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
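/// For example (illustrative), a group accessing A[3*i] and A[3*i+1] but not
/// A[3*i+2] has a gap: the wide load would also read the unused element, so
/// that lane must be masked off when accessing it speculatively is not safe.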
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
202 
203 /// We don't interleave loops with a known constant trip count below this
204 /// number.
205 static const unsigned TinyTripCountInterleaveThreshold = 128;
206 
207 static cl::opt<unsigned> ForceTargetNumScalarRegs(
208     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
209     cl::desc("A flag that overrides the target's number of scalar registers."));
210 
211 static cl::opt<unsigned> ForceTargetNumVectorRegs(
212     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
213     cl::desc("A flag that overrides the target's number of vector registers."));
214 
215 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
216     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
217     cl::desc("A flag that overrides the target's max interleave factor for "
218              "scalar loops."));
219 
220 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
221     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
222     cl::desc("A flag that overrides the target's max interleave factor for "
223              "vectorized loops."));
224 
225 static cl::opt<unsigned> ForceTargetInstructionCost(
226     "force-target-instruction-cost", cl::init(0), cl::Hidden,
227     cl::desc("A flag that overrides the target's expected cost for "
228              "an instruction to a single constant value. Mostly "
229              "useful for getting consistent testing."));
230 
231 static cl::opt<unsigned> SmallLoopCost(
232     "small-loop-cost", cl::init(20), cl::Hidden,
233     cl::desc(
234         "The cost of a loop that is considered 'small' by the interleaver."));
235 
236 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
237     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
238     cl::desc("Enable the use of the block frequency analysis to access PGO "
239              "heuristics minimizing code growth in cold regions and being more "
240              "aggressive in hot regions."));
241 
242 // Runtime interleave loops for load/store throughput.
243 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
244     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
245     cl::desc(
246         "Enable runtime interleaving until load/store ports are saturated"));
247 
248 /// The number of stores in a loop that are allowed to need predication.
249 static cl::opt<unsigned> NumberOfStoresToPredicate(
250     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
251     cl::desc("Max number of stores to be predicated behind an if."));
252 
253 static cl::opt<bool> EnableIndVarRegisterHeur(
254     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
255     cl::desc("Count the induction variable only once when interleaving"));
256 
257 static cl::opt<bool> EnableCondStoresVectorization(
258     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
259     cl::desc("Enable if predication of stores during vectorization."));
260 
261 static cl::opt<unsigned> MaxNestedScalarReductionIC(
262     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
263     cl::desc("The maximum interleave count to use when interleaving a scalar "
264              "reduction in a nested loop."));
265 
266 cl::opt<bool> EnableVPlanNativePath(
267     "enable-vplan-native-path", cl::init(false), cl::Hidden,
268     cl::desc("Enable VPlan-native vectorization path with "
269              "support for outer loop vectorization."));
270 
271 // FIXME: Remove this switch once we have divergence analysis. Currently we
272 // assume divergent non-backedge branches when this switch is true.
273 cl::opt<bool> EnableVPlanPredication(
274     "enable-vplan-predication", cl::init(false), cl::Hidden,
275     cl::desc("Enable VPlan-native vectorization path predicator with "
276              "support for outer loop vectorization."));
277 
278 // This flag enables the stress testing of the VPlan H-CFG construction in the
279 // VPlan-native vectorization path. It must be used in conjunction with
280 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
281 // verification of the H-CFGs built.
282 static cl::opt<bool> VPlanBuildStressTest(
283     "vplan-build-stress-test", cl::init(false), cl::Hidden,
284     cl::desc(
285         "Build VPlan for every supported loop nest in the function and bail "
286         "out right after the build (stress test the VPlan H-CFG construction "
287         "in the VPlan-native vectorization path)."));
288 
289 cl::opt<bool> llvm::EnableLoopInterleaving(
290     "interleave-loops", cl::init(true), cl::Hidden,
291     cl::desc("Enable loop interleaving in Loop vectorization passes"));
292 cl::opt<bool> llvm::EnableLoopVectorization(
293     "vectorize-loops", cl::init(true), cl::Hidden,
294     cl::desc("Run the Loop vectorization passes"));
295 
296 /// A helper function for converting Scalar types to vector types.
297 /// If the incoming type is void, we return void. If the VF is 1, we return
298 /// the scalar type.
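/// For example (illustrative), ToVectorTy(i32, 4) yields <4 x i32>, while
/// ToVectorTy(i32, 1) and ToVectorTy(void, 4) return the type unchanged.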
299 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
300   if (Scalar->isVoidTy() || VF == 1)
301     return Scalar;
302   return VectorType::get(Scalar, VF);
303 }
304 
305 /// A helper function that returns the type of loaded or stored value.
306 static Type *getMemInstValueType(Value *I) {
307   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
308          "Expected Load or Store instruction");
309   if (auto *LI = dyn_cast<LoadInst>(I))
310     return LI->getType();
311   return cast<StoreInst>(I)->getValueOperand()->getType();
312 }
313 
314 /// A helper function that returns true if the given type is irregular. The
315 /// type is irregular if its allocated size doesn't equal the store size of an
316 /// element of the corresponding vector type at the given vectorization factor.
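/// For example (under a typical data layout), i1 is irregular at VF = 4: four
/// separate i1 values occupy four bytes, but a <4 x i1> vector stores into a
/// single byte, so the scalar lanes are not bitcast compatible with the vector.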
317 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
318   // Determine if an array of VF elements of type Ty is "bitcast compatible"
319   // with a <VF x Ty> vector.
320   if (VF > 1) {
321     auto *VectorTy = VectorType::get(Ty, VF);
322     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
323   }
324 
325   // If the vectorization factor is one, we just check if an array of type Ty
326   // requires padding between elements.
327   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
328 }
329 
330 /// A helper function that returns the reciprocal of the block probability of
331 /// predicated blocks. If we return X, we are assuming the predicated block
332 /// will execute once for every X iterations of the loop header.
333 ///
334 /// TODO: We should use actual block probability here, if available. Currently,
335 ///       we always assume predicated blocks have a 50% chance of executing.
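/// For example, since this returns 2, the cost model roughly halves the cost
/// contribution of a predicated block, treating it as executing on every
/// other iteration of the loop header.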
336 static unsigned getReciprocalPredBlockProb() { return 2; }
337 
338 /// A helper function that adds a 'fast' flag to floating-point operations.
339 static Value *addFastMathFlag(Value *V) {
340   if (isa<FPMathOperator>(V))
341     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
342   return V;
343 }
344 
345 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
346   if (isa<FPMathOperator>(V))
347     cast<Instruction>(V)->setFastMathFlags(FMF);
348   return V;
349 }
350 
351 /// A helper function that returns an integer or floating-point constant with
352 /// value C.
353 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
354   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
355                            : ConstantFP::get(Ty, C);
356 }
357 
358 /// Returns "best known" trip count for the specified loop \p L as defined by
359 /// the following procedure:
360 ///   1) Returns exact trip count if it is known.
361 ///   2) Returns expected trip count according to profile data if any.
362 ///   3) Returns upper bound estimate if it is known.
363 ///   4) Returns None if all of the above failed.
364 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
365   // Check if exact trip count is known.
366   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
367     return ExpectedTC;
368 
369   // Check if there is an expected trip count available from profile data.
370   if (LoopVectorizeWithBlockFrequency)
371     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
372       return EstimatedTC;
373 
374   // Check if upper bound estimate is known.
375   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
376     return ExpectedTC;
377 
378   return None;
379 }
380 
381 namespace llvm {
382 
383 /// InnerLoopVectorizer vectorizes loops which contain only one basic
384 /// block to a specified vectorization factor (VF).
385 /// This class performs the widening of scalars into vectors, or multiple
386 /// scalars. This class also implements the following features:
387 /// * It inserts an epilogue loop for handling loops that don't have iteration
388 ///   counts that are known to be a multiple of the vectorization factor.
389 /// * It handles the code generation for reduction variables.
390 /// * Scalarization (implementation using scalars) of un-vectorizable
391 ///   instructions.
392 /// InnerLoopVectorizer does not perform any vectorization-legality
393 /// checks, and relies on the caller to check for the different legality
394 /// aspects. The InnerLoopVectorizer relies on the
395 /// LoopVectorizationLegality class to provide information about the induction
396 /// and reduction variables that were found for a given vectorization factor.
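/// For example (an illustrative sketch): with VF = 4 and UF = 2, each
/// iteration of the generated vector loop covers 8 iterations of the original
/// loop; for a trip count of 21, the vector loop executes 16 of them and the
/// inserted scalar epilogue loop runs the remaining 5.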
397 class InnerLoopVectorizer {
398 public:
399   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
400                       LoopInfo *LI, DominatorTree *DT,
401                       const TargetLibraryInfo *TLI,
402                       const TargetTransformInfo *TTI, AssumptionCache *AC,
403                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
404                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
405                       LoopVectorizationCostModel *CM)
406       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
407         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
408         Builder(PSE.getSE()->getContext()),
409         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
410   virtual ~InnerLoopVectorizer() = default;
411 
412   /// Create a new empty loop. Unlink the old loop and connect the new one.
413   /// Return the pre-header block of the new loop.
414   BasicBlock *createVectorizedLoopSkeleton();
415 
416   /// Widen a single instruction within the innermost loop.
417   void widenInstruction(Instruction &I);
418 
419   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
420   void fixVectorizedLoop();
421 
422   // Return true if any runtime check is added.
423   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
424 
425   /// A type for vectorized values in the new loop. Each value from the
426   /// original loop, when vectorized, is represented by UF vector values in the
427   /// new unrolled loop, where UF is the unroll factor.
428   using VectorParts = SmallVector<Value *, 2>;
429 
430   /// Vectorize a single PHINode in a block. This method handles the induction
431   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
432   /// arbitrary length vectors.
433   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
434 
435   /// A helper function to scalarize a single Instruction in the innermost loop.
436   /// Generates a scalar instance of \p Instr for the vectorization lane and
437   /// unroll part specified by \p Instance. \p IfPredicateInstr indicates
438   /// whether the instruction requires predication.
439   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
440                             bool IfPredicateInstr);
441 
442   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
443   /// is provided, the integer induction variable will first be truncated to
444   /// the corresponding type.
445   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
446 
447   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
448   /// vector or scalar value on-demand if one is not yet available. When
449   /// vectorizing a loop, we visit the definition of an instruction before its
450   /// uses. When visiting the definition, we either vectorize or scalarize the
451   /// instruction, creating an entry for it in the corresponding map. (In some
452   /// cases, such as induction variables, we will create both vector and scalar
453   /// entries.) Then, as we encounter uses of the definition, we derive values
454   /// for each scalar or vector use unless such a value is already available.
455   /// For example, if we scalarize a definition and one of its uses is vector,
456   /// we build the required vector on-demand with an insertelement sequence
457   /// when visiting the use. Otherwise, if the use is scalar, we can use the
458   /// existing scalar definition.
459   ///
460   /// Return a value in the new loop corresponding to \p V from the original
461   /// loop at unroll index \p Part. If the value has already been vectorized,
462   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
463   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
464   /// a new vector value on-demand by inserting the scalar values into a vector
465   /// with an insertelement sequence. If the value has been neither vectorized
466   /// nor scalarized, it must be loop invariant, so we simply broadcast the
467   /// value into a vector.
468   Value *getOrCreateVectorValue(Value *V, unsigned Part);
469 
470   /// Return a value in the new loop corresponding to \p V from the original
471   /// loop at unroll and vector indices \p Instance. If the value has been
472   /// vectorized but not scalarized, the necessary extractelement instruction
473   /// will be generated.
474   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
475 
476   /// Construct the vector value of a scalarized value \p V one lane at a time.
477   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
478 
479   /// Try to vectorize the interleaved access group that \p Instr belongs to,
480   /// optionally masking the vector operations if \p BlockInMask is non-null.
481   void vectorizeInterleaveGroup(Instruction *Instr,
482                                 VectorParts *BlockInMask = nullptr);
483 
484   /// Vectorize Load and Store instructions, optionally masking the vector
485   /// operations if \p BlockInMask is non-null.
486   void vectorizeMemoryInstruction(Instruction *Instr,
487                                   VectorParts *BlockInMask = nullptr);
488 
489   /// Set the debug location in the builder using the debug location in
490   /// the instruction.
491   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
492 
493   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
494   void fixNonInductionPHIs(void);
495 
496 protected:
497   friend class LoopVectorizationPlanner;
498 
499   /// A small list of PHINodes.
500   using PhiVector = SmallVector<PHINode *, 4>;
501 
502   /// A type for scalarized values in the new loop. Each value from the
503   /// original loop, when scalarized, is represented by UF x VF scalar values
504   /// in the new unrolled loop, where UF is the unroll factor and VF is the
505   /// vectorization factor.
506   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
507 
508   /// Set up the values of the IVs correctly when exiting the vector loop.
509   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
510                     Value *CountRoundDown, Value *EndValue,
511                     BasicBlock *MiddleBlock);
512 
513   /// Create a new induction variable inside L.
514   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
515                                    Value *Step, Instruction *DL);
516 
517   /// Handle all cross-iteration phis in the header.
518   void fixCrossIterationPHIs();
519 
520   /// Fix a first-order recurrence. This is the second phase of vectorizing
521   /// this phi node.
522   void fixFirstOrderRecurrence(PHINode *Phi);
523 
524   /// Fix a reduction cross-iteration phi. This is the second phase of
525   /// vectorizing this phi node.
526   void fixReduction(PHINode *Phi);
527 
528   /// The loop exit block may have single-value PHI nodes with some
529   /// incoming value. While vectorizing, we only handle values that were
530   /// defined inside the loop, and each such PHI should have one incoming
531   /// value for every predecessor of its parent basic block. See PR14725.
532   void fixLCSSAPHIs();
533 
534   /// Iteratively sink the scalarized operands of a predicated instruction into
535   /// the block that was created for it.
536   void sinkScalarOperands(Instruction *PredInst);
537 
538   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
539   /// represented as.
540   void truncateToMinimalBitwidths();
541 
542   /// Insert the new loop to the loop hierarchy and pass manager
543   /// and update the analysis passes.
544   void updateAnalysis();
545 
546   /// Create a broadcast instruction. This method generates a broadcast
547   /// instruction (shuffle) for loop invariant values and for the induction
548   /// value. If this is the induction variable, then we extend it to N, N+1, ...;
549   /// this is needed because each iteration of the loop corresponds to a SIMD
550   /// element.
551   virtual Value *getBroadcastInstrs(Value *V);
552 
553   /// This function adds (StartIdx * Step, (StartIdx + 1) * Step,
554   /// (StartIdx + 2) * Step, ...) to each vector element of Val. The sequence
555   /// starts at StartIdx. \p Opcode is relevant for FP induction variables.
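  /// For example (illustrative), with VF = 4, StartIdx = 0 and Step = S, a
  /// splat input <V, V, V, V> becomes <V, V + S, V + 2*S, V + 3*S>.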
556   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
557                                Instruction::BinaryOps Opcode =
558                                Instruction::BinaryOpsEnd);
559 
560   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
561   /// variable on which to base the steps, \p Step is the size of the step, and
562   /// \p EntryVal is the value from the original loop that maps to the steps.
563   /// Note that \p EntryVal doesn't have to be an induction variable - it
564   /// can also be a truncate instruction.
565   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
566                         const InductionDescriptor &ID);
567 
568   /// Create a vector induction phi node based on an existing scalar one. \p
569   /// EntryVal is the value from the original loop that maps to the vector phi
570   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
571   /// truncate instruction, instead of widening the original IV, we widen a
572   /// version of the IV truncated to \p EntryVal's type.
573   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
574                                        Value *Step, Instruction *EntryVal);
575 
576   /// Returns true if an instruction \p I should be scalarized instead of
577   /// vectorized for the chosen vectorization factor.
578   bool shouldScalarizeInstruction(Instruction *I) const;
579 
580   /// Returns true if we should generate a scalar version of \p IV.
581   bool needsScalarInduction(Instruction *IV) const;
582 
583   /// If there is a cast involved in the induction variable \p ID, which should
584   /// be ignored in the vectorized loop body, this function records the
585   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
586   /// cast. We had already proved that the casted Phi is equal to the uncasted
587   /// Phi in the vectorized loop (under a runtime guard), and therefore
588   /// there is no need to vectorize the cast - the same value can be used in the
589   /// vector loop for both the Phi and the cast.
590   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
591   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
592   ///
593   /// \p EntryVal is the value from the original loop that maps to the vector
594   /// phi node and is used to distinguish what is the IV currently being
595   /// processed - original one (if \p EntryVal is a phi corresponding to the
596   /// original IV) or the "newly-created" one based on the proof mentioned above
597   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
598   /// latter case \p EntryVal is a TruncInst and we must not record anything for
599   /// that IV, but it's error-prone to expect callers of this routine to care
600   /// about that, hence this explicit parameter.
601   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
602                                              const Instruction *EntryVal,
603                                              Value *VectorLoopValue,
604                                              unsigned Part,
605                                              unsigned Lane = UINT_MAX);
606 
607   /// Generate a shuffle sequence that will reverse the vector Vec.
608   virtual Value *reverseVector(Value *Vec);
609 
610   /// Returns (and creates if needed) the original loop trip count.
611   Value *getOrCreateTripCount(Loop *NewLoop);
612 
613   /// Returns (and creates if needed) the trip count of the widened loop.
614   Value *getOrCreateVectorTripCount(Loop *NewLoop);
615 
616   /// Returns a bitcasted value to the requested vector type.
617   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
618   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
619                                 const DataLayout &DL);
620 
621   /// Emit a bypass check to see if the vector trip count is zero, including if
622   /// it overflows.
623   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
624 
625   /// Emit a bypass check to see if all of the SCEV assumptions we've
626   /// had to make are correct.
627   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
628 
629   /// Emit bypass checks to check any memory assumptions we may have made.
630   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
631 
632   /// Compute the transformed value of Index at offset StartValue using step
633   /// StepValue.
634   /// For integer induction, returns StartValue + Index * StepValue.
635   /// For pointer induction, returns StartValue[Index * StepValue].
636   /// FIXME: The newly created binary instructions should contain nsw/nuw
637   /// flags, which can be found from the original scalar operations.
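  /// For example (illustrative), for an integer induction with StartValue 7
  /// and StepValue 3, an Index of 4 is transformed to 7 + 4 * 3 = 19.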
638   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
639                               const DataLayout &DL,
640                               const InductionDescriptor &ID) const;
641 
642   /// Add additional metadata to \p To that was not present on \p Orig.
643   ///
644   /// Currently this is used to add the noalias annotations based on the
645   /// inserted memchecks.  Use this for instructions that are *cloned* into the
646   /// vector loop.
647   void addNewMetadata(Instruction *To, const Instruction *Orig);
648 
649   /// Add metadata from one instruction to another.
650   ///
651   /// This includes both the original MDs from \p From and additional ones (\see
652   /// addNewMetadata).  Use this for *newly created* instructions in the vector
653   /// loop.
654   void addMetadata(Instruction *To, Instruction *From);
655 
656   /// Similar to the previous function but it adds the metadata to a
657   /// vector of instructions.
658   void addMetadata(ArrayRef<Value *> To, Instruction *From);
659 
660   /// The original loop.
661   Loop *OrigLoop;
662 
663   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
664   /// dynamic knowledge to simplify SCEV expressions and converts them to a
665   /// more usable form.
666   PredicatedScalarEvolution &PSE;
667 
668   /// Loop Info.
669   LoopInfo *LI;
670 
671   /// Dominator Tree.
672   DominatorTree *DT;
673 
674   /// Alias Analysis.
675   AliasAnalysis *AA;
676 
677   /// Target Library Info.
678   const TargetLibraryInfo *TLI;
679 
680   /// Target Transform Info.
681   const TargetTransformInfo *TTI;
682 
683   /// Assumption Cache.
684   AssumptionCache *AC;
685 
686   /// Interface to emit optimization remarks.
687   OptimizationRemarkEmitter *ORE;
688 
689   /// LoopVersioning.  It's only set up (non-null) if memchecks were
690   /// used.
691   ///
692   /// This is currently only used to add no-alias metadata based on the
693   /// memchecks.  The actual versioning is performed manually.
694   std::unique_ptr<LoopVersioning> LVer;
695 
696   /// The vectorization SIMD factor to use. Each vector will have this many
697   /// vector elements.
698   unsigned VF;
699 
700   /// The vectorization unroll factor to use. Each scalar is vectorized to this
701   /// many different vector instructions.
702   unsigned UF;
703 
704   /// The builder that we use
705   IRBuilder<> Builder;
706 
707   // --- Vectorization state ---
708 
709   /// The vector-loop preheader.
710   BasicBlock *LoopVectorPreHeader;
711 
712   /// The scalar-loop preheader.
713   BasicBlock *LoopScalarPreHeader;
714 
715   /// Middle block between the vector and the scalar loop.
716   BasicBlock *LoopMiddleBlock;
717 
718   /// The ExitBlock of the scalar loop.
719   BasicBlock *LoopExitBlock;
720 
721   /// The vector loop body.
722   BasicBlock *LoopVectorBody;
723 
724   /// The scalar loop body.
725   BasicBlock *LoopScalarBody;
726 
727   /// A list of all bypass blocks. The first block is the entry of the loop.
728   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
729 
730   /// The new Induction variable which was added to the new block.
731   PHINode *Induction = nullptr;
732 
733   /// The induction variable of the old basic block.
734   PHINode *OldInduction = nullptr;
735 
736   /// Maps values from the original loop to their corresponding values in the
737   /// vectorized loop. A key value can map to either vector values, scalar
738   /// values or both kinds of values, depending on whether the key was
739   /// vectorized and scalarized.
740   VectorizerValueMap VectorLoopValueMap;
741 
742   /// Store instructions that were predicated.
743   SmallVector<Instruction *, 4> PredicatedInstructions;
744 
745   /// Trip count of the original loop.
746   Value *TripCount = nullptr;
747 
748   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
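  /// For example (illustrative), with TripCount = 13, VF = 4 and UF = 1 this
  /// is 13 - 13 % 4 = 12.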
749   Value *VectorTripCount = nullptr;
750 
751   /// The legality analysis.
752   LoopVectorizationLegality *Legal;
753 
754   /// The profitability analysis.
755   LoopVectorizationCostModel *Cost;
756 
757   // Record whether runtime checks are added.
758   bool AddedSafetyChecks = false;
759 
760   // Holds the end values for each induction variable. We save the end values
761   // so we can later fix-up the external users of the induction variables.
762   DenseMap<PHINode *, Value *> IVEndValues;
763 
764   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
765   // fixed up at the end of vector code generation.
766   SmallVector<PHINode *, 8> OrigPHIsToFix;
767 };
768 
769 class InnerLoopUnroller : public InnerLoopVectorizer {
770 public:
771   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
772                     LoopInfo *LI, DominatorTree *DT,
773                     const TargetLibraryInfo *TLI,
774                     const TargetTransformInfo *TTI, AssumptionCache *AC,
775                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
776                     LoopVectorizationLegality *LVL,
777                     LoopVectorizationCostModel *CM)
778       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
779                             UnrollFactor, LVL, CM) {}
780 
781 private:
782   Value *getBroadcastInstrs(Value *V) override;
783   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
784                        Instruction::BinaryOps Opcode =
785                        Instruction::BinaryOpsEnd) override;
786   Value *reverseVector(Value *Vec) override;
787 };
788 
789 } // end namespace llvm
790 
791 /// Look for a meaningful debug location on the instruction or its
792 /// operands.
793 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
794   if (!I)
795     return I;
796 
797   DebugLoc Empty;
798   if (I->getDebugLoc() != Empty)
799     return I;
800 
801   for (Value *Op : I->operands()) {
802     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
803       if (OpInst->getDebugLoc() != Empty)
804         return OpInst;
805   }
806 
807   return I;
808 }
809 
810 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
811   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
812     const DILocation *DIL = Inst->getDebugLoc();
813     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
814         !isa<DbgInfoIntrinsic>(Inst)) {
815       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
816       if (NewDIL)
817         B.SetCurrentDebugLocation(NewDIL.getValue());
818       else
819         LLVM_DEBUG(dbgs()
820                    << "Failed to create new discriminator: "
821                    << DIL->getFilename() << " Line: " << DIL->getLine() << "\n");
822     } else
823       B.SetCurrentDebugLocation(DIL);
825   } else
826     B.SetCurrentDebugLocation(DebugLoc());
827 }
828 
829 /// Write a record \p DebugMsg about vectorization failure to the debug
830 /// output stream. If \p I is passed, it is an instruction that prevents
831 /// vectorization.
832 #ifndef NDEBUG
833 static void debugVectorizationFailure(const StringRef DebugMsg,
834     Instruction *I) {
835   dbgs() << "LV: Not vectorizing: " << DebugMsg;
836   if (I != nullptr)
837     dbgs() << " " << *I;
838   else
839     dbgs() << '.';
840   dbgs() << '\n';
841 }
842 #endif
843 
844 /// Create an analysis remark that explains why vectorization failed
845 ///
846 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
847 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
848 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
849 /// the location of the remark.  \return the remark object that can be
850 /// streamed to.
851 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
852     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
853   Value *CodeRegion = TheLoop->getHeader();
854   DebugLoc DL = TheLoop->getStartLoc();
855 
856   if (I) {
857     CodeRegion = I->getParent();
858     // If there is no debug location attached to the instruction, fall back to
859     // using the loop's.
860     if (I->getDebugLoc())
861       DL = I->getDebugLoc();
862   }
863 
864   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
865   R << "loop not vectorized: ";
866   return R;
867 }
868 
869 namespace llvm {
870 
871 void reportVectorizationFailure(const StringRef DebugMsg,
872     const StringRef OREMsg, const StringRef ORETag,
873     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
874   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
875   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
876   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
877                 ORETag, TheLoop, I) << OREMsg);
878 }
879 
880 } // end namespace llvm
881 
882 #ifndef NDEBUG
883 /// \return string containing a file name and a line # for the given loop.
884 static std::string getDebugLocString(const Loop *L) {
885   std::string Result;
886   if (L) {
887     raw_string_ostream OS(Result);
888     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
889       LoopDbgLoc.print(OS);
890     else
891       // Just print the module name.
892       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
893     OS.flush();
894   }
895   return Result;
896 }
897 #endif
898 
899 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
900                                          const Instruction *Orig) {
901   // If the loop was versioned with memchecks, add the corresponding no-alias
902   // metadata.
903   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
904     LVer->annotateInstWithNoAlias(To, Orig);
905 }
906 
907 void InnerLoopVectorizer::addMetadata(Instruction *To,
908                                       Instruction *From) {
909   propagateMetadata(To, From);
910   addNewMetadata(To, From);
911 }
912 
913 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
914                                       Instruction *From) {
915   for (Value *V : To) {
916     if (Instruction *I = dyn_cast<Instruction>(V))
917       addMetadata(I, From);
918   }
919 }
920 
921 namespace llvm {
922 
923 // Loop vectorization cost-model hints for how the scalar epilogue loop should
924 // be lowered.
925 enum ScalarEpilogueLowering {
926 
927   // The default: allowing scalar epilogues.
928   CM_ScalarEpilogueAllowed,
929 
930   // Vectorization with OptForSize: don't allow epilogues.
931   CM_ScalarEpilogueNotAllowedOptSize,
932 
933   // A special case of vectorization with OptForSize: loops with a very small
934   // trip count are considered for vectorization under OptForSize, thereby
935   // making sure the cost of their loop body is dominant, free of runtime
936   // guards and scalar iteration overheads.
937   CM_ScalarEpilogueNotAllowedLowTripLoop,
938 
939   // Loop hint predicate indicating an epilogue is undesired.
940   CM_ScalarEpilogueNotNeededUsePredicate
941 };
942 
943 /// LoopVectorizationCostModel - estimates the expected speedups due to
944 /// vectorization.
945 /// In many cases vectorization is not profitable. This can happen because of
946 /// a number of reasons. In this class we mainly attempt to predict the
947 /// expected speedup/slowdowns due to the supported instruction set. We use the
948 /// TargetTransformInfo to query the different backends for the cost of
949 /// different operations.
950 class LoopVectorizationCostModel {
951 public:
952   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
953                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
954                              LoopVectorizationLegality *Legal,
955                              const TargetTransformInfo &TTI,
956                              const TargetLibraryInfo *TLI, DemandedBits *DB,
957                              AssumptionCache *AC,
958                              OptimizationRemarkEmitter *ORE, const Function *F,
959                              const LoopVectorizeHints *Hints,
960                              InterleavedAccessInfo &IAI)
961       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
962         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
963         Hints(Hints), InterleaveInfo(IAI) {}
964 
965   /// \return An upper bound for the vectorization factor, or None if
966   /// vectorization and interleaving should be avoided up front.
967   Optional<unsigned> computeMaxVF();
968 
969   /// \return True if runtime checks are required for vectorization, and false
970   /// otherwise.
971   bool runtimeChecksRequired();
972 
973   /// \return The most profitable vectorization factor and the cost of that VF.
974   /// This method checks every power of two up to \p MaxVF. If a user VF is
975   /// specified and non-zero, then that vectorization factor will be selected if
976   /// vectorization is possible.
977   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
978 
979   /// Setup cost-based decisions for user vectorization factor.
980   void selectUserVectorizationFactor(unsigned UserVF) {
981     collectUniformsAndScalars(UserVF);
982     collectInstsToScalarize(UserVF);
983   }
984 
985   /// \return The size (in bits) of the smallest and widest types in the code
986   /// that needs to be vectorized. We ignore values that remain scalar such as
987   /// 64 bit loop indices.
988   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
989 
990   /// \return The desired interleave count.
991   /// If interleave count has been specified by metadata it will be returned.
992   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
993   /// are the selected vectorization factor and the cost of the selected VF.
994   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
995 
996   /// A memory access instruction may be vectorized in more than one way; the
997   /// form of the instruction after vectorization depends on cost.
998   /// This function makes cost-based decisions for Load/Store instructions
999   /// and collects them in a map. This decision map is used for building
1000   /// the lists of loop-uniform and loop-scalar instructions.
1001   /// The calculated cost is saved with the widening decision in order to
1002   /// avoid redundant calculations.
1003   void setCostBasedWideningDecision(unsigned VF);
1004 
1005   /// A struct that represents some properties of the register usage
1006   /// of a loop.
1007   struct RegisterUsage {
1008     /// Holds the number of loop invariant values that are used in the loop.
1009     unsigned LoopInvariantRegs;
1010 
1011     /// Holds the maximum number of concurrent live intervals in the loop.
1012     unsigned MaxLocalUsers;
1013   };
1014 
1015   /// \return Returns information about the register usages of the loop for the
1016   /// given vectorization factors.
1017   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1018 
1019   /// Collect values we want to ignore in the cost model.
1020   void collectValuesToIgnore();
1021 
1022   /// \returns The smallest bitwidth each instruction can be represented with.
1023   /// The vector equivalents of these instructions should be truncated to this
1024   /// type.
1025   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1026     return MinBWs;
1027   }
1028 
1029   /// \returns True if it is more profitable to scalarize instruction \p I for
1030   /// vectorization factor \p VF.
1031   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1032     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1033 
1034     // Cost model is not run in the VPlan-native path - return conservative
1035     // result until this changes.
1036     if (EnableVPlanNativePath)
1037       return false;
1038 
1039     auto Scalars = InstsToScalarize.find(VF);
1040     assert(Scalars != InstsToScalarize.end() &&
1041            "VF not yet analyzed for scalarization profitability");
1042     return Scalars->second.find(I) != Scalars->second.end();
1043   }
1044 
1045   /// Returns true if \p I is known to be uniform after vectorization.
1046   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1047     if (VF == 1)
1048       return true;
1049 
1050     // Cost model is not run in the VPlan-native path - return conservative
1051     // result until this changes.
1052     if (EnableVPlanNativePath)
1053       return false;
1054 
1055     auto UniformsPerVF = Uniforms.find(VF);
1056     assert(UniformsPerVF != Uniforms.end() &&
1057            "VF not yet analyzed for uniformity");
1058     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1059   }
1060 
1061   /// Returns true if \p I is known to be scalar after vectorization.
1062   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1063     if (VF == 1)
1064       return true;
1065 
1066     // Cost model is not run in the VPlan-native path - return conservative
1067     // result until this changes.
1068     if (EnableVPlanNativePath)
1069       return false;
1070 
1071     auto ScalarsPerVF = Scalars.find(VF);
1072     assert(ScalarsPerVF != Scalars.end() &&
1073            "Scalar values are not calculated for VF");
1074     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1075   }
1076 
1077   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1078   /// for vectorization factor \p VF.
1079   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1080     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1081            !isProfitableToScalarize(I, VF) &&
1082            !isScalarAfterVectorization(I, VF);
1083   }
1084 
1085   /// Decision that was taken during cost calculation for memory instruction.
1086   enum InstWidening {
1087     CM_Unknown,
1088     CM_Widen,         // For consecutive accesses with stride +1.
1089     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1090     CM_Interleave,
1091     CM_GatherScatter,
1092     CM_Scalarize
1093   };
1094 
1095   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1096   /// instruction \p I and vector width \p VF.
1097   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1098                            unsigned Cost) {
1099     assert(VF >= 2 && "Expected VF >=2");
1100     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1101   }
1102 
1103   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1104   /// interleaving group \p Grp and vector width \p VF.
1105   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1106                            InstWidening W, unsigned Cost) {
1107     assert(VF >= 2 && "Expected VF >=2");
1108     // Broadcast this decision to all instructions inside the group,
1109     // but assign the cost to only one instruction.
1110     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1111       if (auto *I = Grp->getMember(i)) {
1112         if (Grp->getInsertPos() == I)
1113           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1114         else
1115           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1116       }
1117     }
1118   }
1119 
1120   /// Return the cost model decision for the given instruction \p I and vector
1121   /// width \p VF. Return CM_Unknown if this instruction did not pass
1122   /// through the cost modeling.
1123   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1124     assert(VF >= 2 && "Expected VF >=2");
1125 
1126     // Cost model is not run in the VPlan-native path - return conservative
1127     // result until this changes.
1128     if (EnableVPlanNativePath)
1129       return CM_GatherScatter;
1130 
1131     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1132     auto Itr = WideningDecisions.find(InstOnVF);
1133     if (Itr == WideningDecisions.end())
1134       return CM_Unknown;
1135     return Itr->second.first;
1136   }
1137 
1138   /// Return the vectorization cost for the given instruction \p I and vector
1139   /// width \p VF.
1140   unsigned getWideningCost(Instruction *I, unsigned VF) {
1141     assert(VF >= 2 && "Expected VF >=2");
1142     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1143     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1144            "The cost is not calculated");
1145     return WideningDecisions[InstOnVF].second;
1146   }
1147 
1148   /// Return True if instruction \p I is an optimizable truncate whose operand
1149   /// is an induction variable. Such a truncate will be removed by adding a new
1150   /// induction variable with the destination type.
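  /// For example (illustrative), if an i64 induction %i is only used as
  /// "trunc i64 %i to i32", the truncate can be removed by introducing a new
  /// i32 induction variable that produces the narrow value directly.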
1151   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1152     // If the instruction is not a truncate, return false.
1153     auto *Trunc = dyn_cast<TruncInst>(I);
1154     if (!Trunc)
1155       return false;
1156 
1157     // Get the source and destination types of the truncate.
1158     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1159     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1160 
1161     // If the truncate is free for the given types, return false. Replacing a
1162     // free truncate with an induction variable would add an induction variable
1163     // update instruction to each iteration of the loop. We exclude from this
1164     // check the primary induction variable since it will need an update
1165     // instruction regardless.
1166     Value *Op = Trunc->getOperand(0);
1167     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1168       return false;
1169 
1170     // If the truncated value is not an induction variable, return false.
1171     return Legal->isInductionPhi(Op);
1172   }
1173 
1174   /// Collects the instructions to scalarize for each predicated instruction in
1175   /// the loop.
1176   void collectInstsToScalarize(unsigned VF);
1177 
1178   /// Collect Uniform and Scalar values for the given \p VF.
1179   /// The sets depend on CM decision for Load/Store instructions
1180   /// that may be vectorized as interleave, gather-scatter or scalarized.
1181   void collectUniformsAndScalars(unsigned VF) {
1182     // Do the analysis once.
1183     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1184       return;
1185     setCostBasedWideningDecision(VF);
1186     collectLoopUniforms(VF);
1187     collectLoopScalars(VF);
1188   }
1189 
1190   /// Returns true if the target machine supports masked store operation
1191   /// for the given \p DataType and kind of access to \p Ptr.
1192   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1193     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1194   }
1195 
1196   /// Returns true if the target machine supports masked load operation
1197   /// for the given \p DataType and kind of access to \p Ptr.
1198   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1199     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1200   }
1201 
1202   /// Returns true if the target machine supports masked scatter operation
1203   /// for the given \p DataType.
1204   bool isLegalMaskedScatter(Type *DataType) {
1205     return TTI.isLegalMaskedScatter(DataType);
1206   }
1207 
1208   /// Returns true if the target machine supports masked gather operation
1209   /// for the given \p DataType.
1210   bool isLegalMaskedGather(Type *DataType) {
1211     return TTI.isLegalMaskedGather(DataType);
1212   }
1213 
1214   /// Returns true if the target machine can represent \p V as a masked gather
1215   /// or scatter operation.
1216   bool isLegalGatherOrScatter(Value *V) {
1217     bool LI = isa<LoadInst>(V);
1218     bool SI = isa<StoreInst>(V);
1219     if (!LI && !SI)
1220       return false;
1221     auto *Ty = getMemInstValueType(V);
1222     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1223   }
1224 
1225   /// Returns true if \p I is an instruction that will be scalarized with
1226   /// predication. Such instructions include conditional stores and
1227   /// instructions that may divide by zero.
1228   /// If a non-zero VF has been calculated, we check if \p I will be scalarized
1229   /// with predication for that VF.
1230   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1231 
1232   // Returns true if \p I is an instruction that will be predicated either
1233   // through scalar predication or masked load/store or masked gather/scatter.
1234   // Superset of instructions that return true for isScalarWithPredication.
1235   bool isPredicatedInst(Instruction *I) {
1236     if (!blockNeedsPredication(I->getParent()))
1237       return false;
1238     // Loads and stores that need some form of masked operation are predicated
1239     // instructions.
1240     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1241       return Legal->isMaskRequired(I);
1242     return isScalarWithPredication(I);
1243   }
1244 
1245   /// Returns true if \p I is a memory instruction with consecutive memory
1246   /// access that can be widened.
1247   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1248 
1249   /// Returns true if \p I is a memory instruction in an interleaved-group
1250   /// of memory accesses that can be vectorized with wide vector loads/stores
1251   /// and shuffles.
1252   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1253 
1254   /// Check if \p Instr belongs to any interleaved access group.
1255   bool isAccessInterleaved(Instruction *Instr) {
1256     return InterleaveInfo.isInterleaved(Instr);
1257   }
1258 
1259   /// Get the interleaved access group that \p Instr belongs to.
1260   const InterleaveGroup<Instruction> *
1261   getInterleavedAccessGroup(Instruction *Instr) {
1262     return InterleaveInfo.getInterleaveGroup(Instr);
1263   }
1264 
1265   /// Returns true if an interleaved group requires a scalar iteration
1266   /// to handle accesses with gaps, and there is nothing preventing us from
1267   /// creating a scalar epilogue.
1268   bool requiresScalarEpilogue() const {
1269     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1270   }
1271 
1272   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1273   /// loop hint annotation.
1274   bool isScalarEpilogueAllowed() const {
1275     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1276   }
1277 
1278   /// Returns true if all loop blocks should be masked to fold tail loop.
1279   bool foldTailByMasking() const { return FoldTailByMasking; }
1280 
1281   bool blockNeedsPredication(BasicBlock *BB) {
1282     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1283   }
1284 
1285   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1286   /// with factor VF.  Return the cost of the instruction, including
1287   /// scalarization overhead if it's needed.
1288   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1289 
1290   /// Estimate cost of a call instruction CI if it were vectorized with factor
1291   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either the vector version isn't available or is too
  /// expensive.
1295   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1296 
1297 private:
1298   unsigned NumPredStores = 0;
1299 
1300   /// \return An upper bound for the vectorization factor, larger than zero.
1301   /// One is returned if vectorization should best be avoided due to cost.
1302   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1303 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1311   using VectorizationCostTy = std::pair<unsigned, bool>;
1312 
1313   /// Returns the expected execution cost. The unit of the cost does
1314   /// not matter because we use the 'cost' units to compare different
1315   /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
1317   VectorizationCostTy expectedCost(unsigned VF);
1318 
1319   /// Returns the execution time cost of an instruction for a given vector
1320   /// width. Vector width of one means scalar.
1321   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1322 
1323   /// The cost-computation logic from getInstructionCost which provides
1324   /// the vector type as an output parameter.
1325   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1326 
1327   /// Calculate vectorization cost of memory instruction \p I.
1328   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1329 
1330   /// The cost computation for scalarized memory instruction.
1331   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1332 
1333   /// The cost computation for interleaving group of memory instructions.
1334   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1335 
1336   /// The cost computation for Gather/Scatter instruction.
1337   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1338 
1339   /// The cost computation for widening instruction \p I with consecutive
1340   /// memory access.
1341   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1342 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored? 0 : extract of last
  /// element).
1347   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1348 
1349   /// Estimate the overhead of scalarizing an instruction. This is a
1350   /// convenience wrapper for the type-based getScalarizationOverhead API.
1351   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1352 
  /// Returns whether the instruction is a load or store and will be emitted
1354   /// as a vector operation.
1355   bool isConsecutiveLoadOrStore(Instruction *I);
1356 
1357   /// Returns true if an artificially high cost for emulated masked memrefs
1358   /// should be used.
1359   bool useEmulatedMaskMemRefHack(Instruction *I);
1360 
1361   /// Map of scalar integer values to the smallest bitwidth they can be legally
1362   /// represented as. The vector equivalents of these values should be truncated
1363   /// to this type.
1364   MapVector<Instruction *, uint64_t> MinBWs;
1365 
1366   /// A type representing the costs for instructions if they were to be
1367   /// scalarized rather than vectorized. The entries are Instruction-Cost
1368   /// pairs.
1369   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1370 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1373   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1374 
1375   /// Records whether it is allowed to have the original scalar loop execute at
1376   /// least once. This may be needed as a fallback loop in case runtime
1377   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not divisible by the VF,
1379   /// or as a peel-loop to handle gaps in interleave-groups.
1380   /// Under optsize and when the trip count is very small we don't allow any
1381   /// iterations to execute in the scalar loop.
1382   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1383 
1384   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1385   bool FoldTailByMasking = false;
1386 
1387   /// A map holding scalar costs for different vectorization factors. The
1388   /// presence of a cost for an instruction in the mapping indicates that the
1389   /// instruction will be scalarized when vectorizing with the associated
1390   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1391   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1392 
1393   /// Holds the instructions known to be uniform after vectorization.
1394   /// The data is collected per VF.
1395   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1396 
1397   /// Holds the instructions known to be scalar after vectorization.
1398   /// The data is collected per VF.
1399   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1400 
1401   /// Holds the instructions (address computations) that are forced to be
1402   /// scalarized.
1403   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1404 
1405   /// Returns the expected difference in cost from scalarizing the expression
1406   /// feeding a predicated instruction \p PredInst. The instructions to
1407   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1408   /// non-negative return value implies the expression will be scalarized.
1409   /// Currently, only single-use chains are considered for scalarization.
1410   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1411                               unsigned VF);
1412 
1413   /// Collect the instructions that are uniform after vectorization. An
1414   /// instruction is uniform if we represent it with a single scalar value in
1415   /// the vectorized loop corresponding to each vector iteration. Examples of
1416   /// uniform instructions include pointer operands of consecutive or
1417   /// interleaved memory accesses. Note that although uniformity implies an
1418   /// instruction will be scalar, the reverse is not true. In general, a
1419   /// scalarized instruction will be represented by VF scalar values in the
1420   /// vectorized loop, each corresponding to an iteration of the original
1421   /// scalar loop.
1422   void collectLoopUniforms(unsigned VF);
1423 
1424   /// Collect the instructions that are scalar after vectorization. An
1425   /// instruction is scalar if it is known to be uniform or will be scalarized
1426   /// during vectorization. Non-uniform scalarized instructions will be
1427   /// represented by VF values in the vectorized loop, each corresponding to an
1428   /// iteration of the original scalar loop.
1429   void collectLoopScalars(unsigned VF);
1430 
1431   /// Keeps cost model vectorization decision and cost for instructions.
1432   /// Right now it is used for memory instructions only.
1433   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1434                                 std::pair<InstWidening, unsigned>>;
1435 
1436   DecisionList WideningDecisions;
1437 
1438   /// Returns true if \p V is expected to be vectorized and it needs to be
1439   /// extracted.
1440   bool needsExtract(Value *V, unsigned VF) const {
1441     Instruction *I = dyn_cast<Instruction>(V);
1442     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1443       return false;
1444 
1445     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1447     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1448     // the scalars are collected. That should be a safe assumption in most
1449     // cases, because we check if the operands have vectorizable types
1450     // beforehand in LoopVectorizationLegality.
1451     return Scalars.find(VF) == Scalars.end() ||
1452            !isScalarAfterVectorization(I, VF);
1453   };
1454 
1455   /// Returns a range containing only operands needing to be extracted.
1456   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1457                                                    unsigned VF) {
1458     return SmallVector<Value *, 4>(make_filter_range(
1459         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1460   }
1461 
1462 public:
1463   /// The loop that we evaluate.
1464   Loop *TheLoop;
1465 
1466   /// Predicated scalar evolution analysis.
1467   PredicatedScalarEvolution &PSE;
1468 
1469   /// Loop Info analysis.
1470   LoopInfo *LI;
1471 
1472   /// Vectorization legality.
1473   LoopVectorizationLegality *Legal;
1474 
1475   /// Vector target information.
1476   const TargetTransformInfo &TTI;
1477 
1478   /// Target Library Info.
1479   const TargetLibraryInfo *TLI;
1480 
1481   /// Demanded bits analysis.
1482   DemandedBits *DB;
1483 
1484   /// Assumption cache.
1485   AssumptionCache *AC;
1486 
1487   /// Interface to emit optimization remarks.
1488   OptimizationRemarkEmitter *ORE;
1489 
1490   const Function *TheFunction;
1491 
1492   /// Loop Vectorize Hint.
1493   const LoopVectorizeHints *Hints;
1494 
1495   /// The interleave access information contains groups of interleaved accesses
1496   /// with the same stride and close to each other.
1497   InterleavedAccessInfo &InterleaveInfo;
1498 
1499   /// Values to ignore in the cost model.
1500   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1501 
1502   /// Values to ignore in the cost model when VF > 1.
1503   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1504 };
1505 
1506 } // end namespace llvm
1507 
1508 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1515 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1516 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1517 // provides *explicit vectorization hints* (LV can bypass legal checks and
1518 // assume that vectorization is legal). However, both hints are implemented
1519 // using the same metadata (llvm.loop.vectorize, processed by
1520 // LoopVectorizeHints). This will be fixed in the future when the native IR
1521 // representation for pragma 'omp simd' is introduced.
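//
// For illustration (a hypothetical source loop, not tied to any test), an
// outer loop annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] = B[i][j];
// carries the vector-length information required below, whereas a bare
// 'vectorize(enable)' hint without a width is not treated as explicit.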
1522 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1523                                    OptimizationRemarkEmitter *ORE) {
1524   assert(!OuterLp->empty() && "This is not an outer loop");
1525   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1526 
1527   // Only outer loops with an explicit vectorization hint are supported.
1528   // Unannotated outer loops are ignored.
1529   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1530     return false;
1531 
1532   Function *Fn = OuterLp->getHeader()->getParent();
1533   if (!Hints.allowVectorization(Fn, OuterLp,
1534                                 true /*VectorizeOnlyWhenForced*/)) {
1535     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1536     return false;
1537   }
1538 
1539   if (Hints.getInterleave() > 1) {
1540     // TODO: Interleave support is future work.
1541     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1542                          "outer loops.\n");
1543     Hints.emitRemarkWithHints();
1544     return false;
1545   }
1546 
1547   return true;
1548 }
1549 
1550 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1551                                   OptimizationRemarkEmitter *ORE,
1552                                   SmallVectorImpl<Loop *> &V) {
1553   // Collect inner loops and outer loops without irreducible control flow. For
1554   // now, only collect outer loops that have explicit vectorization hints. If we
1555   // are stress testing the VPlan H-CFG construction, we collect the outermost
1556   // loop of every loop nest.
1557   if (L.empty() || VPlanBuildStressTest ||
1558       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1559     LoopBlocksRPO RPOT(&L);
1560     RPOT.perform(LI);
1561     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1562       V.push_back(&L);
1563       // TODO: Collect inner loops inside marked outer loops in case
1564       // vectorization fails for the outer loop. Do not invoke
1565       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1566       // already known to be reducible. We can use an inherited attribute for
1567       // that.
1568       return;
1569     }
1570   }
1571   for (Loop *InnerL : L)
1572     collectSupportedLoops(*InnerL, LI, ORE, V);
1573 }
1574 
1575 namespace {
1576 
1577 /// The LoopVectorize Pass.
1578 struct LoopVectorize : public FunctionPass {
1579   /// Pass identification, replacement for typeid
1580   static char ID;
1581 
1582   LoopVectorizePass Impl;
1583 
1584   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1585                          bool VectorizeOnlyWhenForced = false)
1586       : FunctionPass(ID) {
1587     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1588     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1589     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1590   }
1591 
1592   bool runOnFunction(Function &F) override {
1593     if (skipFunction(F))
1594       return false;
1595 
1596     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1597     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1598     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1599     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1600     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1601     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1602     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1603     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1604     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1605     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1606     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1607     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1608     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1609 
1610     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1611         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1612 
1613     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1614                         GetLAA, *ORE, PSI);
1615   }
1616 
1617   void getAnalysisUsage(AnalysisUsage &AU) const override {
1618     AU.addRequired<AssumptionCacheTracker>();
1619     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1620     AU.addRequired<DominatorTreeWrapperPass>();
1621     AU.addRequired<LoopInfoWrapperPass>();
1622     AU.addRequired<ScalarEvolutionWrapperPass>();
1623     AU.addRequired<TargetTransformInfoWrapperPass>();
1624     AU.addRequired<AAResultsWrapperPass>();
1625     AU.addRequired<LoopAccessLegacyAnalysis>();
1626     AU.addRequired<DemandedBitsWrapperPass>();
1627     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1628 
1629     // We currently do not preserve loopinfo/dominator analyses with outer loop
1630     // vectorization. Until this is addressed, mark these analyses as preserved
1631     // only for non-VPlan-native path.
1632     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1633     if (!EnableVPlanNativePath) {
1634       AU.addPreserved<LoopInfoWrapperPass>();
1635       AU.addPreserved<DominatorTreeWrapperPass>();
1636     }
1637 
1638     AU.addPreserved<BasicAAWrapperPass>();
1639     AU.addPreserved<GlobalsAAWrapperPass>();
1640     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1641   }
1642 };
1643 
1644 } // end anonymous namespace
1645 
1646 //===----------------------------------------------------------------------===//
1647 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1648 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1649 //===----------------------------------------------------------------------===//
1650 
1651 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1652   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1655   Instruction *Instr = dyn_cast<Instruction>(V);
1656   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1657                      (!Instr ||
1658                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1659   // Place the code for broadcasting invariant variables in the new preheader.
1660   IRBuilder<>::InsertPointGuard Guard(Builder);
1661   if (SafeToHoist)
1662     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1663 
1664   // Broadcast the scalar into all locations in the vector.
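  // As an illustration (value names are approximate), with VF = 4 and an i32
  // scalar %v, the splat typically expands to:
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                          <4 x i32> undef, <4 x i32> zeroinitializer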
1665   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1666 
1667   return Shuf;
1668 }
1669 
1670 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1671     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1672   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1673          "Expected either an induction phi-node or a truncate of it!");
1674   Value *Start = II.getStartValue();
1675 
1676   // Construct the initial value of the vector IV in the vector loop preheader
1677   auto CurrIP = Builder.saveIP();
1678   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1679   if (isa<TruncInst>(EntryVal)) {
1680     assert(Start->getType()->isIntegerTy() &&
1681            "Truncation requires an integer type");
1682     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1683     Step = Builder.CreateTrunc(Step, TruncType);
1684     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1685   }
1686   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1687   Value *SteppedStart =
1688       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1689 
1690   // We create vector phi nodes for both integer and floating-point induction
1691   // variables. Here, we determine the kind of arithmetic we will perform.
1692   Instruction::BinaryOps AddOp;
1693   Instruction::BinaryOps MulOp;
1694   if (Step->getType()->isIntegerTy()) {
1695     AddOp = Instruction::Add;
1696     MulOp = Instruction::Mul;
1697   } else {
1698     AddOp = II.getInductionOpcode();
1699     MulOp = Instruction::FMul;
1700   }
1701 
1702   // Multiply the vectorization factor by the step using integer or
1703   // floating-point arithmetic as appropriate.
1704   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1705   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1706 
1707   // Create a vector splat to use in the induction update.
1708   //
1709   // FIXME: If the step is non-constant, we create the vector splat with
1710   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1711   //        handle a constant vector splat.
1712   Value *SplatVF = isa<Constant>(Mul)
1713                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1714                        : Builder.CreateVectorSplat(VF, Mul);
1715   Builder.restoreIP(CurrIP);
1716 
1717   // We may need to add the step a number of times, depending on the unroll
1718   // factor. The last of those goes into the PHI.
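  // Roughly, for UF = 2 the generated code has the following shape (names
  // are approximate):
  //   vec.ind      = phi [ SteppedStart, vector.ph ], [ vec.ind.next, latch ]
  //   step.add     = vec.ind + SplatVF    ; used by unroll part 1
  //   vec.ind.next = step.add + SplatVF   ; feeds the phi on the backedge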
1719   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1720                                     &*LoopVectorBody->getFirstInsertionPt());
1721   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1722   Instruction *LastInduction = VecInd;
1723   for (unsigned Part = 0; Part < UF; ++Part) {
1724     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1725 
1726     if (isa<TruncInst>(EntryVal))
1727       addMetadata(LastInduction, EntryVal);
1728     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1729 
1730     LastInduction = cast<Instruction>(addFastMathFlag(
1731         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1732     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1733   }
1734 
1735   // Move the last step to the end of the latch block. This ensures consistent
1736   // placement of all induction updates.
1737   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1738   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1739   auto *ICmp = cast<Instruction>(Br->getCondition());
1740   LastInduction->moveBefore(ICmp);
1741   LastInduction->setName("vec.ind.next");
1742 
1743   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1744   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1745 }
1746 
1747 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1748   return Cost->isScalarAfterVectorization(I, VF) ||
1749          Cost->isProfitableToScalarize(I, VF);
1750 }
1751 
1752 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1753   if (shouldScalarizeInstruction(IV))
1754     return true;
1755   auto isScalarInst = [&](User *U) -> bool {
1756     auto *I = cast<Instruction>(U);
1757     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1758   };
1759   return llvm::any_of(IV->users(), isScalarInst);
1760 }
1761 
1762 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1763     const InductionDescriptor &ID, const Instruction *EntryVal,
1764     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1765   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1766          "Expected either an induction phi-node or a truncate of it!");
1767 
  // This induction variable is not the phi from the original loop but a
  // newly-created IV, based on the proof that the casted phi is equal to the
  // uncasted phi in the vectorized loop (possibly under a runtime guard). It
  // reuses the same InductionDescriptor as the original IV, but we don't have
  // to do any recording in this case - that is done when the original IV is
  // processed.
1774   if (isa<TruncInst>(EntryVal))
1775     return;
1776 
1777   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1778   if (Casts.empty())
1779     return;
1780   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1782   // induction update chain itself.
1783   Instruction *CastInst = *Casts.begin();
1784   if (Lane < UINT_MAX)
1785     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1786   else
1787     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1788 }
1789 
1790 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1791   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1792          "Primary induction variable must have an integer type");
1793 
1794   auto II = Legal->getInductionVars()->find(IV);
1795   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1796 
1797   auto ID = II->second;
1798   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1799 
1800   // The scalar value to broadcast. This will be derived from the canonical
1801   // induction variable.
1802   Value *ScalarIV = nullptr;
1803 
1804   // The value from the original loop to which we are mapping the new induction
1805   // variable.
1806   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1807 
1808   // True if we have vectorized the induction variable.
1809   auto VectorizedIV = false;
1810 
1811   // Determine if we want a scalar version of the induction variable. This is
1812   // true if the induction variable itself is not widened, or if it has at
1813   // least one user in the loop that is not widened.
1814   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1815 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1818   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1819          "Induction step should be loop invariant");
1820   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1821   Value *Step = nullptr;
1822   if (PSE.getSE()->isSCEVable(IV->getType())) {
1823     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1824     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1825                              LoopVectorPreHeader->getTerminator());
1826   } else {
1827     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1828   }
1829 
1830   // Try to create a new independent vector induction variable. If we can't
1831   // create the phi node, we will splat the scalar induction variable in each
1832   // loop iteration.
1833   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1834     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1835     VectorizedIV = true;
1836   }
1837 
1838   // If we haven't yet vectorized the induction variable, or if we will create
1839   // a scalar one, we need to define the scalar induction variable and step
1840   // values. If we were given a truncation type, truncate the canonical
1841   // induction variable and step. Otherwise, derive these values from the
1842   // induction descriptor.
1843   if (!VectorizedIV || NeedsScalarIV) {
1844     ScalarIV = Induction;
1845     if (IV != OldInduction) {
1846       ScalarIV = IV->getType()->isIntegerTy()
1847                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1848                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1849                                           IV->getType());
1850       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1851       ScalarIV->setName("offset.idx");
1852     }
1853     if (Trunc) {
1854       auto *TruncType = cast<IntegerType>(Trunc->getType());
1855       assert(Step->getType()->isIntegerTy() &&
1856              "Truncation requires an integer step");
1857       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1858       Step = Builder.CreateTrunc(Step, TruncType);
1859     }
1860   }
1861 
1862   // If we haven't yet vectorized the induction variable, splat the scalar
1863   // induction variable, and build the necessary step vectors.
1864   // TODO: Don't do it unless the vectorized IV is really required.
1865   if (!VectorizedIV) {
1866     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1867     for (unsigned Part = 0; Part < UF; ++Part) {
1868       Value *EntryPart =
1869           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1870       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1871       if (Trunc)
1872         addMetadata(EntryPart, Trunc);
1873       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1874     }
1875   }
1876 
1877   // If an induction variable is only used for counting loop iterations or
1878   // calculating addresses, it doesn't need to be widened. Create scalar steps
1879   // that can be used by instructions we will later scalarize. Note that the
1880   // addition of the scalar steps will not increase the number of instructions
1881   // in the loop in the common case prior to InstCombine. We will be trading
1882   // one vector extract for each scalar step.
1883   if (NeedsScalarIV)
1884     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1885 }
1886 
1887 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1888                                           Instruction::BinaryOps BinOp) {
1889   // Create and check the types.
1890   assert(Val->getType()->isVectorTy() && "Must be a vector");
1891   int VLen = Val->getType()->getVectorNumElements();
1892 
1893   Type *STy = Val->getType()->getScalarType();
1894   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1895          "Induction Step must be an integer or FP");
1896   assert(Step->getType() == STy && "Step has wrong type");
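  // As a concrete illustration, for an integer induction with VLen = 4,
  // StartIdx = 0 and step %s, the result is Val + <0, 1, 2, 3> * %s, i.e.
  // each lane is advanced by its lane index times the step.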
1897 
1898   SmallVector<Constant *, 8> Indices;
1899 
1900   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
1902     for (int i = 0; i < VLen; ++i)
1903       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1904 
1905     // Add the consecutive indices to the vector value.
1906     Constant *Cv = ConstantVector::get(Indices);
1907     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1908     Step = Builder.CreateVectorSplat(VLen, Step);
1909     assert(Step->getType() == Val->getType() && "Invalid step vec");
1910     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1911     // which can be found from the original scalar operations.
1912     Step = Builder.CreateMul(Cv, Step);
1913     return Builder.CreateAdd(Val, Step, "induction");
1914   }
1915 
1916   // Floating point induction.
1917   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1918          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
1920   for (int i = 0; i < VLen; ++i)
1921     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1922 
1923   // Add the consecutive indices to the vector value.
1924   Constant *Cv = ConstantVector::get(Indices);
1925 
1926   Step = Builder.CreateVectorSplat(VLen, Step);
1927 
1928   // Floating point operations had to be 'fast' to enable the induction.
1929   FastMathFlags Flags;
1930   Flags.setFast();
1931 
1932   Value *MulOp = Builder.CreateFMul(Cv, Step);
1933   if (isa<Instruction>(MulOp))
    // We have to check because MulOp may have been folded to a constant.
1935     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1936 
1937   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1938   if (isa<Instruction>(BOp))
1939     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1940   return BOp;
1941 }
1942 
1943 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1944                                            Instruction *EntryVal,
1945                                            const InductionDescriptor &ID) {
1946   // We shouldn't have to build scalar steps if we aren't vectorizing.
1947   assert(VF > 1 && "VF should be greater than one");
1948 
  // Get the value type and ensure it and the step have the same type.
1950   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1951   assert(ScalarIVTy == Step->getType() &&
1952          "Val and Step should have the same type");
1953 
1954   // We build scalar steps for both integer and floating-point induction
1955   // variables. Here, we determine the kind of arithmetic we will perform.
1956   Instruction::BinaryOps AddOp;
1957   Instruction::BinaryOps MulOp;
1958   if (ScalarIVTy->isIntegerTy()) {
1959     AddOp = Instruction::Add;
1960     MulOp = Instruction::Mul;
1961   } else {
1962     AddOp = ID.getInductionOpcode();
1963     MulOp = Instruction::FMul;
1964   }
1965 
1966   // Determine the number of scalars we need to generate for each unroll
1967   // iteration. If EntryVal is uniform, we only need to generate the first
1968   // lane. Otherwise, we generate all VF values.
1969   unsigned Lanes =
1970       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1971                                                                          : VF;
1972   // Compute the scalar steps and save the results in VectorLoopValueMap.
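  // For illustration, with VF = 4, UF = 2 and a non-uniform EntryVal, this
  // produces the scalars ScalarIV + {0,1,2,3} * Step for part 0 and
  // ScalarIV + {4,5,6,7} * Step for part 1.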
1973   for (unsigned Part = 0; Part < UF; ++Part) {
1974     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1975       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1976       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1977       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1978       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1979       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1980     }
1981   }
1982 }
1983 
1984 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1985   assert(V != Induction && "The new induction variable should not be used.");
1986   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1987   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1988 
1989   // If we have a stride that is replaced by one, do it here. Defer this for
1990   // the VPlan-native path until we start running Legal checks in that path.
1991   if (!EnableVPlanNativePath && Legal->hasStride(V))
1992     V = ConstantInt::get(V->getType(), 1);
1993 
1994   // If we have a vector mapped to this value, return it.
1995   if (VectorLoopValueMap.hasVectorValue(V, Part))
1996     return VectorLoopValueMap.getVectorValue(V, Part);
1997 
1998   // If the value has not been vectorized, check if it has been scalarized
1999   // instead. If it has been scalarized, and we actually need the value in
2000   // vector form, we will construct the vector values on demand.
2001   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2002     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2003 
2004     // If we've scalarized a value, that value should be an instruction.
2005     auto *I = cast<Instruction>(V);
2006 
2007     // If we aren't vectorizing, we can just copy the scalar map values over to
2008     // the vector map.
2009     if (VF == 1) {
2010       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2011       return ScalarValue;
2012     }
2013 
2014     // Get the last scalar instruction we generated for V and Part. If the value
2015     // is known to be uniform after vectorization, this corresponds to lane zero
2016     // of the Part unroll iteration. Otherwise, the last instruction is the one
2017     // we created for the last vector lane of the Part unroll iteration.
2018     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2019     auto *LastInst = cast<Instruction>(
2020         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2021 
2022     // Set the insert point after the last scalarized instruction. This ensures
2023     // the insertelement sequence will directly follow the scalar definitions.
2024     auto OldIP = Builder.saveIP();
2025     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2026     Builder.SetInsertPoint(&*NewIP);
2027 
2028     // However, if we are vectorizing, we need to construct the vector values.
2029     // If the value is known to be uniform after vectorization, we can just
2030     // broadcast the scalar value corresponding to lane zero for each unroll
2031     // iteration. Otherwise, we construct the vector values using insertelement
2032     // instructions. Since the resulting vectors are stored in
2033     // VectorLoopValueMap, we will only generate the insertelements once.
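    // A rough sketch of the packing for VF = 2 (names are illustrative):
    //   %pack.0 = insertelement <2 x i32> undef, i32 %scalar.lane0, i32 0
    //   %pack.1 = insertelement <2 x i32> %pack.0, i32 %scalar.lane1, i32 1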
2034     Value *VectorValue = nullptr;
2035     if (Cost->isUniformAfterVectorization(I, VF)) {
2036       VectorValue = getBroadcastInstrs(ScalarValue);
2037       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2038     } else {
2039       // Initialize packing with insertelements to start from undef.
2040       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2041       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2042       for (unsigned Lane = 0; Lane < VF; ++Lane)
2043         packScalarIntoVectorValue(V, {Part, Lane});
2044       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2045     }
2046     Builder.restoreIP(OldIP);
2047     return VectorValue;
2048   }
2049 
2050   // If this scalar is unknown, assume that it is a constant or that it is
2051   // loop invariant. Broadcast V and save the value for future uses.
2052   Value *B = getBroadcastInstrs(V);
2053   VectorLoopValueMap.setVectorValue(V, Part, B);
2054   return B;
2055 }
2056 
2057 Value *
2058 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2059                                             const VPIteration &Instance) {
2060   // If the value is not an instruction contained in the loop, it should
2061   // already be scalar.
2062   if (OrigLoop->isLoopInvariant(V))
2063     return V;
2064 
2065   assert(Instance.Lane > 0
2066              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2067              : true && "Uniform values only have lane zero");
2068 
2069   // If the value from the original loop has not been vectorized, it is
2070   // represented by UF x VF scalar values in the new loop. Return the requested
2071   // scalar value.
2072   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2073     return VectorLoopValueMap.getScalarValue(V, Instance);
2074 
2075   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2076   // for the given unroll part. If this entry is not a vector type (i.e., the
2077   // vectorization factor is one), there is no need to generate an
2078   // extractelement instruction.
2079   auto *U = getOrCreateVectorValue(V, Instance.Part);
2080   if (!U->getType()->isVectorTy()) {
2081     assert(VF == 1 && "Value not scalarized has non-vector type");
2082     return U;
2083   }
2084 
2085   // Otherwise, the value from the original loop has been vectorized and is
2086   // represented by UF vector values. Extract and return the requested scalar
2087   // value from the appropriate vector lane.
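  // E.g. requesting lane 2 of a <4 x i32> vector value produces (illustrative)
  //   extractelement <4 x i32> %vec.value, i32 2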
2088   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2089 }
2090 
2091 void InnerLoopVectorizer::packScalarIntoVectorValue(
2092     Value *V, const VPIteration &Instance) {
2093   assert(V != Induction && "The new induction variable should not be used.");
2094   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2095   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2096 
2097   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2098   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2099   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2100                                             Builder.getInt32(Instance.Lane));
2101   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2102 }
2103 
2104 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2105   assert(Vec->getType()->isVectorTy() && "Invalid type");
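  // E.g. for VF = 4 the shuffle mask is <3, 2, 1, 0>, which reverses the
  // order of the vector lanes.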
2106   SmallVector<Constant *, 8> ShuffleMask;
2107   for (unsigned i = 0; i < VF; ++i)
2108     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2109 
2110   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2111                                      ConstantVector::get(ShuffleMask),
2112                                      "reverse");
2113 }
2114 
2115 // Return whether we allow using masked interleave-groups (for dealing with
2116 // strided loads/stores that reside in predicated blocks, or for dealing
2117 // with gaps).
2118 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2119   // If an override option has been passed in for interleaved accesses, use it.
2120   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2121     return EnableMaskedInterleavedMemAccesses;
2122 
2123   return TTI.enableMaskedInterleavedAccessVectorization();
2124 }
2125 
2126 // Try to vectorize the interleave group that \p Instr belongs to.
2127 //
// E.g. Translate the following interleaved load group (factor = 3):
2129 //   for (i = 0; i < N; i+=3) {
2130 //     R = Pic[i];             // Member of index 0
2131 //     G = Pic[i+1];           // Member of index 1
2132 //     B = Pic[i+2];           // Member of index 2
2133 //     ... // do something to R, G, B
2134 //   }
2135 // To:
2136 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2137 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2138 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2139 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2140 //
// Or translate the following interleaved store group (factor = 3):
2142 //   for (i = 0; i < N; i+=3) {
2143 //     ... do something to R, G, B
2144 //     Pic[i]   = R;           // Member of index 0
2145 //     Pic[i+1] = G;           // Member of index 1
2146 //     Pic[i+2] = B;           // Member of index 2
2147 //   }
2148 // To:
2149 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2150 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2151 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2152 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2153 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2154 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2155                                                    VectorParts *BlockInMask) {
2156   const InterleaveGroup<Instruction> *Group =
2157       Cost->getInterleavedAccessGroup(Instr);
2158   assert(Group && "Fail to get an interleaved access group.");
2159 
2160   // Skip if current instruction is not the insert position.
2161   if (Instr != Group->getInsertPos())
2162     return;
2163 
2164   const DataLayout &DL = Instr->getModule()->getDataLayout();
2165   Value *Ptr = getLoadStorePointerOperand(Instr);
2166 
2167   // Prepare for the vector type of the interleaved load/store.
2168   Type *ScalarTy = getMemInstValueType(Instr);
2169   unsigned InterleaveFactor = Group->getFactor();
2170   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2171   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2172 
2173   // Prepare for the new pointers.
2174   setDebugLocFromInst(Builder, Ptr);
2175   SmallVector<Value *, 2> NewPtrs;
2176   unsigned Index = Group->getIndex(Instr);
2177 
2178   VectorParts Mask;
2179   bool IsMaskForCondRequired = BlockInMask;
2180   if (IsMaskForCondRequired) {
2181     Mask = *BlockInMask;
2182     // TODO: extend the masked interleaved-group support to reversed access.
2183     assert(!Group->isReverse() && "Reversed masked interleave-group "
2184                                   "not supported.");
2185   }
2186 
2187   // If the group is reverse, adjust the index to refer to the last vector lane
2188   // instead of the first. We adjust the index from the first vector lane,
2189   // rather than directly getting the pointer for lane VF - 1, because the
2190   // pointer operand of the interleaved access is supposed to be uniform. For
2191   // uniform instructions, we're only required to generate a value for the
2192   // first vector lane in each unroll iteration.
2193   if (Group->isReverse())
2194     Index += (VF - 1) * Group->getFactor();
2195 
2196   bool InBounds = false;
2197   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2198     InBounds = gep->isInBounds();
2199 
2200   for (unsigned Part = 0; Part < UF; Part++) {
2201     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2202 
    // Note that the current instruction could be a member at any index. We
    // need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2214     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2215     if (InBounds)
2216       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2217 
2218     // Cast to the vector pointer type.
2219     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2220   }
2221 
2222   setDebugLocFromInst(Builder, Instr);
2223   Value *UndefVec = UndefValue::get(VecTy);
2224 
2225   Value *MaskForGaps = nullptr;
2226   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2227     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2228     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2229   }
2230 
2231   // Vectorize the interleaved load group.
2232   if (isa<LoadInst>(Instr)) {
2233     // For each unroll part, create a wide load for the group.
2234     SmallVector<Value *, 2> NewLoads;
2235     for (unsigned Part = 0; Part < UF; Part++) {
2236       Instruction *NewLoad;
2237       if (IsMaskForCondRequired || MaskForGaps) {
2238         assert(useMaskedInterleavedAccesses(*TTI) &&
2239                "masked interleaved groups are not allowed.");
2240         Value *GroupMask = MaskForGaps;
2241         if (IsMaskForCondRequired) {
2242           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2243           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
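          // The replicated mask repeats each block-mask bit once per member
          // of the group: e.g. a mask <m0, m1, m2, m3> with factor 3 becomes
          // (roughly) <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>.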
2244           Value *ShuffledMask = Builder.CreateShuffleVector(
2245               Mask[Part], Undefs, RepMask, "interleaved.mask");
2246           GroupMask = MaskForGaps
2247                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2248                                                 MaskForGaps)
2249                           : ShuffledMask;
2250         }
2251         NewLoad =
2252             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2253                                      GroupMask, UndefVec, "wide.masked.vec");
2254       }
2255       else
2256         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2257                                             Group->getAlignment(), "wide.vec");
2258       Group->addMetadata(NewLoad);
2259       NewLoads.push_back(NewLoad);
2260     }
2261 
2262     // For each member in the group, shuffle out the appropriate data from the
2263     // wide loads.
2264     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2265       Instruction *Member = Group->getMember(I);
2266 
2267       // Skip the gaps in the group.
2268       if (!Member)
2269         continue;
2270 
2271       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2272       for (unsigned Part = 0; Part < UF; Part++) {
2273         Value *StridedVec = Builder.CreateShuffleVector(
2274             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2275 
        // If this member has a different type, cast the result to that type.
2277         if (Member->getType() != ScalarTy) {
2278           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2279           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2280         }
2281 
2282         if (Group->isReverse())
2283           StridedVec = reverseVector(StridedVec);
2284 
2285         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2286       }
2287     }
2288     return;
2289   }
2290 
2291   // The sub vector type for current instruction.
2292   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2293 
2294   // Vectorize the interleaved store group.
2295   for (unsigned Part = 0; Part < UF; Part++) {
2296     // Collect the stored vector from each member.
2297     SmallVector<Value *, 4> StoredVecs;
2298     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2300       Instruction *Member = Group->getMember(i);
2301       assert(Member && "Fail to get a member from an interleaved store group");
2302 
2303       Value *StoredVec = getOrCreateVectorValue(
2304           cast<StoreInst>(Member)->getValueOperand(), Part);
2305       if (Group->isReverse())
2306         StoredVec = reverseVector(StoredVec);
2307 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2311         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2312 
2313       StoredVecs.push_back(StoredVec);
2314     }
2315 
2316     // Concatenate all vectors into a wide vector.
2317     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2318 
2319     // Interleave the elements in the wide vector.
2320     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2321     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2322                                               "interleaved.vec");
2323 
2324     Instruction *NewStoreInstr;
2325     if (IsMaskForCondRequired) {
2326       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2327       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2328       Value *ShuffledMask = Builder.CreateShuffleVector(
2329           Mask[Part], Undefs, RepMask, "interleaved.mask");
2330       NewStoreInstr = Builder.CreateMaskedStore(
2331           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2332     }
2333     else
2334       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2335         Group->getAlignment());
2336 
2337     Group->addMetadata(NewStoreInstr);
2338   }
2339 }
2340 
2341 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2342                                                      VectorParts *BlockInMask) {
2343   // Attempt to issue a wide load.
2344   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2345   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2346 
2347   assert((LI || SI) && "Invalid Load/Store instruction");
2348 
2349   LoopVectorizationCostModel::InstWidening Decision =
2350       Cost->getWideningDecision(Instr, VF);
2351   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2352          "CM decision should be taken at this point");
2353   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2354     return vectorizeInterleaveGroup(Instr);
2355 
2356   Type *ScalarDataTy = getMemInstValueType(Instr);
2357   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2358   Value *Ptr = getLoadStorePointerOperand(Instr);
2359   unsigned Alignment = getLoadStoreAlignment(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2362   const DataLayout &DL = Instr->getModule()->getDataLayout();
2363   if (!Alignment)
2364     Alignment = DL.getABITypeAlignment(ScalarDataTy);
2365   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2366 
2367   // Determine if the pointer operand of the access is either consecutive or
2368   // reverse consecutive.
2369   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2370   bool ConsecutiveStride =
2371       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2372   bool CreateGatherScatter =
2373       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2374 
2375   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2376   // gather/scatter. Otherwise Decision should have been to Scalarize.
2377   assert((ConsecutiveStride || CreateGatherScatter) &&
2378          "The instruction should be scalarized");
2379 
2380   // Handle consecutive loads/stores.
2381   if (ConsecutiveStride)
2382     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2383 
2384   VectorParts Mask;
2385   bool isMaskRequired = BlockInMask;
2386   if (isMaskRequired)
2387     Mask = *BlockInMask;
2388 
2389   bool InBounds = false;
2390   if (auto *gep = dyn_cast<GetElementPtrInst>(
2391           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2392     InBounds = gep->isInBounds();
2393 
2394   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2395     // Calculate the pointer for the specific unroll-part.
2396     GetElementPtrInst *PartPtr = nullptr;
2397 
2398     if (Reverse) {
      // If the address is consecutive but reversed, then the wide load/store
      // needs to start at the last vector element.
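      // E.g. with VF = 4, unroll part Part covers the elements
      // [Ptr - Part*4 - 3, Ptr - Part*4]: we first step back by Part * VF and
      // then by VF - 1 to reach the lowest address of that reversed group.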
2401       PartPtr = cast<GetElementPtrInst>(
2402           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2403       PartPtr->setIsInBounds(InBounds);
2404       PartPtr = cast<GetElementPtrInst>(
2405           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2406       PartPtr->setIsInBounds(InBounds);
2407       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2408         Mask[Part] = reverseVector(Mask[Part]);
2409     } else {
2410       PartPtr = cast<GetElementPtrInst>(
2411           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2412       PartPtr->setIsInBounds(InBounds);
2413     }
2414 
2415     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2416   };
2417 
2418   // Handle Stores:
2419   if (SI) {
2420     setDebugLocFromInst(Builder, SI);
2421 
2422     for (unsigned Part = 0; Part < UF; ++Part) {
2423       Instruction *NewSI = nullptr;
2424       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2425       if (CreateGatherScatter) {
2426         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2427         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2428         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2429                                             MaskPart);
2430       } else {
2431         if (Reverse) {
2432           // If we store to reverse consecutive memory locations, then we need
2433           // to reverse the order of elements in the stored value.
2434           StoredVal = reverseVector(StoredVal);
2435           // We don't want to update the value in the map as it might be used in
2436           // another expression. So don't call resetVectorValue(StoredVal).
2437         }
2438         auto *VecPtr = CreateVecPtr(Part, Ptr);
2439         if (isMaskRequired)
2440           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2441                                             Mask[Part]);
2442         else
2443           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2444       }
2445       addMetadata(NewSI, SI);
2446     }
2447     return;
2448   }
2449 
2450   // Handle loads.
2451   assert(LI && "Must have a load instruction");
2452   setDebugLocFromInst(Builder, LI);
2453   for (unsigned Part = 0; Part < UF; ++Part) {
2454     Value *NewLI;
2455     if (CreateGatherScatter) {
2456       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2457       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2458       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2459                                          nullptr, "wide.masked.gather");
2460       addMetadata(NewLI, LI);
2461     } else {
2462       auto *VecPtr = CreateVecPtr(Part, Ptr);
2463       if (isMaskRequired)
2464         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2465                                          UndefValue::get(DataTy),
2466                                          "wide.masked.load");
2467       else
2468         NewLI =
2469             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2470 
2471       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2472       addMetadata(NewLI, LI);
2473       if (Reverse)
2474         NewLI = reverseVector(NewLI);
2475     }
2476     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2477   }
2478 }
2479 
2480 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2481                                                const VPIteration &Instance,
2482                                                bool IfPredicateInstr) {
2483   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2484 
2485   setDebugLocFromInst(Builder, Instr);
2486 
  // Does this instruction return a value?
2488   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2489 
2490   Instruction *Cloned = Instr->clone();
2491   if (!IsVoidRetTy)
2492     Cloned->setName(Instr->getName() + ".cloned");
2493 
2494   // Replace the operands of the cloned instructions with their scalar
2495   // equivalents in the new loop.
2496   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2497     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2498     Cloned->setOperand(op, NewOp);
2499   }
2500   addNewMetadata(Cloned, Instr);
2501 
2502   // Place the cloned scalar in the new loop.
2503   Builder.Insert(Cloned);
2504 
2505   // Add the cloned scalar to the scalar map entry.
2506   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2507 
  // If we just cloned a new assumption, add it to the assumption cache.
2509   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2510     if (II->getIntrinsicID() == Intrinsic::assume)
2511       AC->registerAssumption(II);
2512 
2513   // End if-block.
2514   if (IfPredicateInstr)
2515     PredicatedInstructions.push_back(Cloned);
2516 }
2517 
2518 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2519                                                       Value *End, Value *Step,
2520                                                       Instruction *DL) {
2521   BasicBlock *Header = L->getHeader();
2522   BasicBlock *Latch = L->getLoopLatch();
2523   // As we're just creating this loop, it's possible no latch exists
2524   // yet. If so, use the header as this will be a single block loop.
2525   if (!Latch)
2526     Latch = Header;
2527 
2528   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2529   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2530   setDebugLocFromInst(Builder, OldInst);
2531   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2532 
2533   Builder.SetInsertPoint(Latch->getTerminator());
2534   setDebugLocFromInst(Builder, OldInst);
2535 
2536   // Create i+1 and fill the PHINode.
2537   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2538   Induction->addIncoming(Start, L->getLoopPreheader());
2539   Induction->addIncoming(Next, Latch);
2540   // Create the compare.
2541   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2542   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2543 
2544   // Now we have two terminators. Remove the old one from the block.
2545   Latch->getTerminator()->eraseFromParent();
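
  // The result is roughly the following skeleton (a sketch only; block and
  // value names are assumed and Start is taken to be 0 here):
  //   vector.body:
  //     %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  //     ...
  //     %index.next = add i64 %index, <Step>
  //     %cmp = icmp eq i64 %index.next, <End>
  //     br i1 %cmp, label %exit, label %vector.body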
2546 
2547   return Induction;
2548 }
2549 
2550 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2551   if (TripCount)
2552     return TripCount;
2553 
2554   assert(L && "Create Trip Count for null loop.");
2555   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2556   // Find the loop boundaries.
2557   ScalarEvolution *SE = PSE.getSE();
2558   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2559   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2560          "Invalid loop count");
2561 
2562   Type *IdxTy = Legal->getWidestInductionType();
2563   assert(IdxTy && "No type for induction");
2564 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the compare.
  // The only way we can get a backedge-taken count in that case is if the
  // induction variable was signed and as such will not overflow, so truncation
  // is legal.
2570   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2571       IdxTy->getPrimitiveSizeInBits())
2572     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2573   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2574 
2575   // Get the total trip count from the count by adding 1.
2576   const SCEV *ExitCount = SE->getAddExpr(
2577       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2578 
2579   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2580 
2581   // Expand the trip count and place the new instructions in the preheader.
2582   // Notice that the pre-header does not change, only the loop body.
2583   SCEVExpander Exp(*SE, DL, "induction");
2584 
2585   // Count holds the overall loop count (N).
2586   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2587                                 L->getLoopPreheader()->getTerminator());
2588 
2589   if (TripCount->getType()->isPointerTy())
2590     TripCount =
2591         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2592                                     L->getLoopPreheader()->getTerminator());
2593 
2594   return TripCount;
2595 }
2596 
2597 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2598   if (VectorTripCount)
2599     return VectorTripCount;
2600 
2601   Value *TC = getOrCreateTripCount(L);
2602   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2603 
2604   Type *Ty = TC->getType();
2605   Constant *Step = ConstantInt::get(Ty, VF * UF);
2606 
2607   // If the tail is to be folded by masking, round the number of iterations N
2608   // up to a multiple of Step instead of rounding down. This is done by first
2609   // adding Step-1 and then rounding down. Note that it's ok if this addition
2610   // overflows: the vector induction variable will eventually wrap to zero given
2611   // that it starts at zero and its Step is a power of two; the loop will then
2612   // exit, with the last early-exit vector comparison also producing all-true.
2613   if (Cost->foldTailByMasking()) {
2614     assert(isPowerOf2_32(VF * UF) &&
2615            "VF*UF must be a power of 2 when folding tail by masking");
2616     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2617   }
2618 
2619   // Now we need to generate the expression for the part of the loop that the
2620   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2621   // iterations are not required for correctness, or N - Step, otherwise. Step
2622   // is equal to the vectorization factor (number of SIMD elements) times the
2623   // unroll factor (number of SIMD instructions).
2624   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2625 
2626   // If there is a non-reversed interleaved group that may speculatively access
2627   // memory out-of-bounds, we need to ensure that there will be at least one
2628   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2629   // the trip count, we set the remainder to be equal to the step. If the step
2630   // does not evenly divide the trip count, no adjustment is necessary since
2631   // there will already be scalar iterations. Note that the minimum iterations
2632   // check ensures that N >= Step.
2633   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2634     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2635     R = Builder.CreateSelect(IsZero, Step, R);
2636   }
2637 
2638   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2639 
2640   return VectorTripCount;
2641 }
2642 
2643 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2644                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
2646   unsigned VF = DstVTy->getNumElements();
2647   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2648   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2649   Type *SrcElemTy = SrcVecTy->getElementType();
2650   Type *DstElemTy = DstVTy->getElementType();
2651   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2652          "Vector elements must have same size");
2653 
2654   // Do a direct cast if element types are castable.
2655   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2656     return Builder.CreateBitOrPointerCast(V, DstVTy);
2657   }
  // V cannot be directly cast to the desired vector type. This may happen when
  // V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast using an intermediate integer
  // type, i.e. Ptr <-> Int <-> Float.
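  // For example (assuming a DataLayout with 64-bit pointers; the types here
  // are illustrative only): casting <2 x double> to <2 x i8*> goes through
  // <2 x i64>, i.e. a bitcast followed by an inttoptr.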
2662   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2663          "Only one type should be a pointer type");
2664   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2665          "Only one type should be a floating point type");
2666   Type *IntTy =
2667       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2668   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2669   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2670   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2671 }
2672 
2673 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2674                                                          BasicBlock *Bypass) {
2675   Value *Count = getOrCreateTripCount(L);
2676   BasicBlock *BB = L->getLoopPreheader();
2677   IRBuilder<> Builder(BB->getTerminator());
2678 
2679   // Generate code to check if the loop's trip count is less than VF * UF, or
2680   // equal to it in case a scalar epilogue is required; this implies that the
2681   // vector trip count is zero. This check also covers the case where adding one
2682   // to the backedge-taken count overflowed leading to an incorrect trip count
2683   // of zero. In this case we will also jump to the scalar loop.
2684   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2685                                           : ICmpInst::ICMP_ULT;
2686 
2687   // If tail is to be folded, vector loop takes care of all iterations.
2688   Value *CheckMinIters = Builder.getFalse();
2689   if (!Cost->foldTailByMasking())
2690     CheckMinIters = Builder.CreateICmp(
2691         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2692         "min.iters.check");
2693 
2694   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2695   // Update dominator tree immediately if the generated block is a
2696   // LoopBypassBlock because SCEV expansions to generate loop bypass
2697   // checks may query it before the current function is finished.
2698   DT->addNewBlock(NewBB, BB);
2699   if (L->getParentLoop())
2700     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2701   ReplaceInstWithInst(BB->getTerminator(),
2702                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2703   LoopBypassBlocks.push_back(BB);
2704 }
2705 
2706 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2707   BasicBlock *BB = L->getLoopPreheader();
2708 
  // Generate the code to check the SCEV assumptions that we made.
2710   // We want the new basic block to start at the first instruction in a
2711   // sequence of instructions that form a check.
2712   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2713                    "scev.check");
2714   Value *SCEVCheck =
2715       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2716 
2717   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2718     if (C->isZero())
2719       return;
2720 
2721   assert(!BB->getParent()->hasOptSize() &&
2722          "Cannot SCEV check stride or overflow when optimizing for size");
2723 
2724   // Create a new block containing the stride check.
2725   BB->setName("vector.scevcheck");
2726   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2727   // Update dominator tree immediately if the generated block is a
2728   // LoopBypassBlock because SCEV expansions to generate loop bypass
2729   // checks may query it before the current function is finished.
2730   DT->addNewBlock(NewBB, BB);
2731   if (L->getParentLoop())
2732     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2733   ReplaceInstWithInst(BB->getTerminator(),
2734                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2735   LoopBypassBlocks.push_back(BB);
2736   AddedSafetyChecks = true;
2737 }
2738 
2739 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2740   // VPlan-native path does not do any analysis for runtime checks currently.
2741   if (EnableVPlanNativePath)
2742     return;
2743 
2744   BasicBlock *BB = L->getLoopPreheader();
2745 
2746   // Generate the code that checks in runtime if arrays overlap. We put the
2747   // checks into a separate block to make the more common case of few elements
2748   // faster.
2749   Instruction *FirstCheckInst;
2750   Instruction *MemRuntimeCheck;
2751   std::tie(FirstCheckInst, MemRuntimeCheck) =
2752       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2753   if (!MemRuntimeCheck)
2754     return;
2755 
2756   if (BB->getParent()->hasOptSize()) {
2757     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2758            "Cannot emit memory checks when optimizing for size, unless forced "
2759            "to vectorize.");
2760     ORE->emit([&]() {
2761       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2762                                         L->getStartLoc(), L->getHeader())
2763              << "Code-size may be reduced by not forcing "
2764                 "vectorization, or by source-code modifications "
2765                 "eliminating the need for runtime checks "
2766                 "(e.g., adding 'restrict').";
2767     });
2768   }
2769 
2770   // Create a new block containing the memory check.
2771   BB->setName("vector.memcheck");
2772   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2773   // Update dominator tree immediately if the generated block is a
2774   // LoopBypassBlock because SCEV expansions to generate loop bypass
2775   // checks may query it before the current function is finished.
2776   DT->addNewBlock(NewBB, BB);
2777   if (L->getParentLoop())
2778     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2779   ReplaceInstWithInst(BB->getTerminator(),
2780                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2781   LoopBypassBlocks.push_back(BB);
2782   AddedSafetyChecks = true;
2783 
2784   // We currently don't use LoopVersioning for the actual loop cloning but we
2785   // still use it to add the noalias metadata.
2786   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2787                                            PSE.getSE());
2788   LVer->prepareNoAliasMetadata();
2789 }
2790 
2791 Value *InnerLoopVectorizer::emitTransformedIndex(
2792     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2793     const InductionDescriptor &ID) const {
2794 
2795   SCEVExpander Exp(*SE, DL, "induction");
2796   auto Step = ID.getStep();
2797   auto StartValue = ID.getStartValue();
2798   assert(Index->getType() == Step->getType() &&
2799          "Index type does not match StepValue type");
2800 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
2807   auto CreateAdd = [&B](Value *X, Value *Y) {
2808     assert(X->getType() == Y->getType() && "Types don't match!");
2809     if (auto *CX = dyn_cast<ConstantInt>(X))
2810       if (CX->isZero())
2811         return Y;
2812     if (auto *CY = dyn_cast<ConstantInt>(Y))
2813       if (CY->isZero())
2814         return X;
2815     return B.CreateAdd(X, Y);
2816   };
2817 
2818   auto CreateMul = [&B](Value *X, Value *Y) {
2819     assert(X->getType() == Y->getType() && "Types don't match!");
2820     if (auto *CX = dyn_cast<ConstantInt>(X))
2821       if (CX->isOne())
2822         return Y;
2823     if (auto *CY = dyn_cast<ConstantInt>(Y))
2824       if (CY->isOne())
2825         return X;
2826     return B.CreateMul(X, Y);
2827   };
2828 
2829   switch (ID.getKind()) {
2830   case InductionDescriptor::IK_IntInduction: {
2831     assert(Index->getType() == StartValue->getType() &&
2832            "Index type does not match StartValue type");
2833     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2834       return B.CreateSub(StartValue, Index);
2835     auto *Offset = CreateMul(
2836         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2837     return CreateAdd(StartValue, Offset);
2838   }
2839   case InductionDescriptor::IK_PtrInduction: {
2840     assert(isa<SCEVConstant>(Step) &&
2841            "Expected constant step for pointer induction");
2842     return B.CreateGEP(
2843         StartValue->getType()->getPointerElementType(), StartValue,
2844         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2845                                            &*B.GetInsertPoint())));
2846   }
2847   case InductionDescriptor::IK_FpInduction: {
2848     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2849     auto InductionBinOp = ID.getInductionBinOp();
2850     assert(InductionBinOp &&
2851            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2852             InductionBinOp->getOpcode() == Instruction::FSub) &&
2853            "Original bin op should be defined for FP induction");
2854 
2855     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2856 
2857     // Floating point operations had to be 'fast' to enable the induction.
2858     FastMathFlags Flags;
2859     Flags.setFast();
2860 
2861     Value *MulExp = B.CreateFMul(StepValue, Index);
2862     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2864       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2865 
2866     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2867                                "induction");
2868     if (isa<Instruction>(BOp))
2869       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2870 
2871     return BOp;
2872   }
2873   case InductionDescriptor::IK_NoInduction:
2874     return nullptr;
2875   }
2876   llvm_unreachable("invalid enum");
2877 }
2878 
2879 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2880   /*
2881    In this function we generate a new loop. The new loop will contain
2882    the vectorized instructions while the old loop will continue to run the
2883    scalar remainder.
2884 
2885        [ ] <-- loop iteration number check.
2886     /   |
2887    /    v
2888   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2889   |  /  |
2890   | /   v
2891   ||   [ ]     <-- vector pre header.
2892   |/    |
2893   |     v
2894   |    [  ] \
2895   |    [  ]_|   <-- vector loop.
2896   |     |
2897   |     v
2898   |   -[ ]   <--- middle-block.
2899   |  /  |
2900   | /   v
2901   -|- >[ ]     <--- new preheader.
2902    |    |
2903    |    v
2904    |   [ ] \
2905    |   [ ]_|   <-- old scalar loop to handle remainder.
2906     \   |
2907      \  v
2908       >[ ]     <-- exit block.
2909    ...
2910    */
2911 
2912   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2913   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2914   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2915   MDNode *OrigLoopID = OrigLoop->getLoopID();
2916   assert(VectorPH && "Invalid loop structure");
2917   assert(ExitBlock && "Must have an exit block");
2918 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
  // induction variables. The code below also supports the case where there is
  // no single induction variable.
  //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
2926   //   - is an integer
2927   //   - counts from zero, stepping by one
2928   //   - is the size of the widest induction variable type
2929   // then we create a new one.
2930   OldInduction = Legal->getPrimaryInduction();
2931   Type *IdxTy = Legal->getWidestInductionType();
2932 
2933   // Split the single block loop into the two loop structure described above.
2934   BasicBlock *VecBody =
2935       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2936   BasicBlock *MiddleBlock =
2937       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2938   BasicBlock *ScalarPH =
2939       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2940 
2941   // Create and register the new vector loop.
2942   Loop *Lp = LI->AllocateLoop();
2943   Loop *ParentLoop = OrigLoop->getParentLoop();
2944 
2945   // Insert the new loop into the loop nest and register the new basic blocks
2946   // before calling any utilities such as SCEV that require valid LoopInfo.
2947   if (ParentLoop) {
2948     ParentLoop->addChildLoop(Lp);
2949     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2950     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2951   } else {
2952     LI->addTopLevelLoop(Lp);
2953   }
2954   Lp->addBasicBlockToLoop(VecBody, *LI);
2955 
2956   // Find the loop boundaries.
2957   Value *Count = getOrCreateTripCount(Lp);
2958 
2959   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2960 
2961   // Now, compare the new count to zero. If it is zero skip the vector loop and
2962   // jump to the scalar loop. This check also covers the case where the
2963   // backedge-taken count is uint##_max: adding one to it will overflow leading
2964   // to an incorrect trip count of zero. In this (rare) case we will also jump
2965   // to the scalar loop.
2966   emitMinimumIterationCountCheck(Lp, ScalarPH);
2967 
2968   // Generate the code to check any assumptions that we've made for SCEV
2969   // expressions.
2970   emitSCEVChecks(Lp, ScalarPH);
2971 
2972   // Generate the code that checks in runtime if arrays overlap. We put the
2973   // checks into a separate block to make the more common case of few elements
2974   // faster.
2975   emitMemRuntimeChecks(Lp, ScalarPH);
2976 
2977   // Generate the induction variable.
2978   // The loop step is equal to the vectorization factor (num of SIMD elements)
2979   // times the unroll factor (num of SIMD instructions).
2980   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2981   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2982   Induction =
2983       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2984                               getDebugLocFromInstOrOperands(OldInduction));
2985 
2986   // We are going to resume the execution of the scalar loop.
2987   // Go over all of the induction variables that we found and fix the
2988   // PHIs that are left in the scalar version of the loop.
2989   // The starting values of PHI nodes depend on the counter of the last
2990   // iteration in the vectorized loop.
2991   // If we come from a bypass edge then we need to start from the original
2992   // start value.
2993 
2994   // This variable saves the new starting index for the scalar loop. It is used
2995   // to test if there are any tail iterations left once the vector loop has
2996   // completed.
2997   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2998   for (auto &InductionEntry : *List) {
2999     PHINode *OrigPhi = InductionEntry.first;
3000     InductionDescriptor II = InductionEntry.second;
3001 
    // Create phi nodes to merge from the backedge-taken check block.
3003     PHINode *BCResumeVal = PHINode::Create(
3004         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3005     // Copy original phi DL over to the new one.
3006     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3007     Value *&EndValue = IVEndValues[OrigPhi];
3008     if (OrigPhi == OldInduction) {
3009       // We know what the end value is.
3010       EndValue = CountRoundDown;
3011     } else {
3012       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3013       Type *StepType = II.getStep()->getType();
3014       Instruction::CastOps CastOp =
3015         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3016       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3017       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3018       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3019       EndValue->setName("ind.end");
3020     }
3021 
3022     // The new PHI merges the original incoming value, in case of a bypass,
3023     // or the value at the end of the vectorized loop.
3024     BCResumeVal->addIncoming(EndValue, MiddleBlock);
3025 
3026     // Fix the scalar body counter (PHI node).
3027     // The old induction's phi node in the scalar body needs the truncated
3028     // value.
3029     for (BasicBlock *BB : LoopBypassBlocks)
3030       BCResumeVal->addIncoming(II.getStartValue(), BB);
3031     OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
3032   }
3033 
3034   // We need the OrigLoop (scalar loop part) latch terminator to help
3035   // produce correct debug info for the middle block BB instructions.
3036   // The legality check stage guarantees that the loop will have a single
3037   // latch.
3038   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3039          "Scalar loop latch terminator isn't a branch");
3040   BranchInst *ScalarLatchBr =
3041       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3042 
3043   // Add a check in the middle block to see if we have completed
3044   // all of the iterations in the first vector loop.
3045   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3046   // If tail is to be folded, we know we don't need to run the remainder.
3047   Value *CmpN = Builder.getTrue();
3048   if (!Cost->foldTailByMasking()) {
3049     CmpN =
3050         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3051                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3052 
3053     // Here we use the same DebugLoc as the scalar loop latch branch instead
3054     // of the corresponding compare because they may have ended up with
3055     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3057     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3058   }
3059 
3060   BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3061   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3062   ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3063 
3064   // Get ready to start creating new instructions into the vectorized body.
3065   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3066 
3067   // Save the state.
3068   LoopVectorPreHeader = Lp->getLoopPreheader();
3069   LoopScalarPreHeader = ScalarPH;
3070   LoopMiddleBlock = MiddleBlock;
3071   LoopExitBlock = ExitBlock;
3072   LoopVectorBody = VecBody;
3073   LoopScalarBody = OldBasicBlock;
3074 
3075   Optional<MDNode *> VectorizedLoopID =
3076       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3077                                       LLVMLoopVectorizeFollowupVectorized});
3078   if (VectorizedLoopID.hasValue()) {
3079     Lp->setLoopID(VectorizedLoopID.getValue());
3080 
3081     // Do not setAlreadyVectorized if loop attributes have been defined
3082     // explicitly.
3083     return LoopVectorPreHeader;
3084   }
3085 
3086   // Keep all loop hints from the original loop on the vector loop (we'll
3087   // replace the vectorizer-specific hints below).
3088   if (MDNode *LID = OrigLoop->getLoopID())
3089     Lp->setLoopID(LID);
3090 
3091   LoopVectorizeHints Hints(Lp, true, *ORE);
3092   Hints.setAlreadyVectorized();
3093 
3094   return LoopVectorPreHeader;
3095 }
3096 
3097 // Fix up external users of the induction variable. At this point, we are
3098 // in LCSSA form, with all external PHIs that use the IV having one input value,
3099 // coming from the remainder loop. We need those PHIs to also have a correct
3100 // value for the IV when arriving directly from the middle block.
3101 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3102                                        const InductionDescriptor &II,
3103                                        Value *CountRoundDown, Value *EndValue,
3104                                        BasicBlock *MiddleBlock) {
3105   // There are two kinds of external IV usages - those that use the value
3106   // computed in the last iteration (the PHI) and those that use the penultimate
3107   // value (the value that feeds into the phi from the loop latch).
3108   // We allow both, but they, obviously, have different values.
3109 
3110   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3111 
3112   DenseMap<Value *, Value *> MissingVals;
3113 
3114   // An external user of the last iteration's value should see the value that
3115   // the remainder loop uses to initialize its own IV.
3116   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3117   for (User *U : PostInc->users()) {
3118     Instruction *UI = cast<Instruction>(U);
3119     if (!OrigLoop->contains(UI)) {
3120       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3121       MissingVals[UI] = EndValue;
3122     }
3123   }
3124 
  // An external user of the penultimate value needs to see EndValue - Step.
3126   // The simplest way to get this is to recompute it from the constituent SCEVs,
3127   // that is Start + (Step * (CRD - 1)).
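  // For example (assumed numbers): for an induction starting at 0 with step 2
  // and CountRoundDown = 8, the escaping penultimate value computed below is
  // 0 + 2 * (8 - 1) = 14.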
3128   for (User *U : OrigPhi->users()) {
3129     auto *UI = cast<Instruction>(U);
3130     if (!OrigLoop->contains(UI)) {
3131       const DataLayout &DL =
3132           OrigLoop->getHeader()->getModule()->getDataLayout();
3133       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3134 
3135       IRBuilder<> B(MiddleBlock->getTerminator());
3136       Value *CountMinusOne = B.CreateSub(
3137           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3138       Value *CMO =
3139           !II.getStep()->getType()->isIntegerTy()
3140               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3141                              II.getStep()->getType())
3142               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3143       CMO->setName("cast.cmo");
3144       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3145       Escape->setName("ind.escape");
3146       MissingVals[UI] = Escape;
3147     }
3148   }
3149 
3150   for (auto &I : MissingVals) {
3151     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3153     // that is %IV2 = phi [...], [ %IV1, %latch ]
3154     // In this case, if IV1 has an external use, we need to avoid adding both
3155     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3156     // don't already have an incoming value for the middle block.
3157     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3158       PHI->addIncoming(I.second, MiddleBlock);
3159   }
3160 }
3161 
3162 namespace {
3163 
3164 struct CSEDenseMapInfo {
3165   static bool canHandle(const Instruction *I) {
3166     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3167            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3168   }
3169 
3170   static inline Instruction *getEmptyKey() {
3171     return DenseMapInfo<Instruction *>::getEmptyKey();
3172   }
3173 
3174   static inline Instruction *getTombstoneKey() {
3175     return DenseMapInfo<Instruction *>::getTombstoneKey();
3176   }
3177 
3178   static unsigned getHashValue(const Instruction *I) {
3179     assert(canHandle(I) && "Unknown instruction!");
3180     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3181                                                            I->value_op_end()));
3182   }
3183 
3184   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3185     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3186         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3187       return LHS == RHS;
3188     return LHS->isIdenticalTo(RHS);
3189   }
3190 };
3191 
3192 } // end anonymous namespace
3193 
/// Perform CSE of induction variable instructions.
3195 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3197   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3198   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3199     Instruction *In = &*I++;
3200 
3201     if (!CSEDenseMapInfo::canHandle(In))
3202       continue;
3203 
3204     // Check if we can replace this instruction with any of the
3205     // visited instructions.
3206     if (Instruction *V = CSEMap.lookup(In)) {
3207       In->replaceAllUsesWith(V);
3208       In->eraseFromParent();
3209       continue;
3210     }
3211 
3212     CSEMap[In] = In;
3213   }
3214 }
3215 
3216 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3217                                                        unsigned VF,
3218                                                        bool &NeedToScalarize) {
3219   Function *F = CI->getCalledFunction();
3220   StringRef FnName = CI->getCalledFunction()->getName();
3221   Type *ScalarRetTy = CI->getType();
3222   SmallVector<Type *, 4> Tys, ScalarTys;
3223   for (auto &ArgOp : CI->arg_operands())
3224     ScalarTys.push_back(ArgOp->getType());
3225 
3226   // Estimate cost of scalarized vector call. The source operands are assumed
3227   // to be vectors, so we need to extract individual elements from there,
3228   // execute VF scalar calls, and then gather the result into the vector return
3229   // value.
3230   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3231   if (VF == 1)
3232     return ScalarCallCost;
3233 
3234   // Compute corresponding vector type for return value and arguments.
3235   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3236   for (Type *ScalarTy : ScalarTys)
3237     Tys.push_back(ToVectorTy(ScalarTy, VF));
3238 
3239   // Compute costs of unpacking argument values for the scalar calls and
3240   // packing the return values to a vector.
3241   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3242 
3243   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
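
  // For illustration (the numbers are assumed, not produced by TTI): with
  // VF = 4, ScalarCallCost = 10 and ScalarizationCost = 6, the scalarization
  // cost is 4 * 10 + 6 = 46; a vector call is used below only if TLI provides
  // a vectorized variant whose cost is lower.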
3244 
3245   // If we can't emit a vector call for this function, then the currently found
3246   // cost is the cost we need to return.
3247   NeedToScalarize = true;
3248   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3249     return Cost;
3250 
3251   // If the corresponding vector cost is cheaper, return its cost.
3252   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3253   if (VectorCallCost < Cost) {
3254     NeedToScalarize = false;
3255     return VectorCallCost;
3256   }
3257   return Cost;
3258 }
3259 
3260 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3261                                                             unsigned VF) {
3262   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3263   assert(ID && "Expected intrinsic call!");
3264 
3265   FastMathFlags FMF;
3266   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3267     FMF = FPMO->getFastMathFlags();
3268 
3269   SmallVector<Value *, 4> Operands(CI->arg_operands());
3270   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3271 }
3272 
3273 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3274   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3275   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3276   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3277 }
3278 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3279   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3280   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3281   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3282 }
3283 
3284 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3285   // For every instruction `I` in MinBWs, truncate the operands, create a
3286   // truncated version of `I` and reextend its result. InstCombine runs
3287   // later and will remove any ext/trunc pairs.
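  // For example (a sketch with assumed value names): if the minimal bitwidth
  // of %a = add <4 x i32> %x, %y is 8, the operands are truncated to <4 x i8>,
  // the add is re-created on <4 x i8> (with wrap flags dropped), and the
  // result is zero-extended back to <4 x i32>.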
3288   SmallPtrSet<Value *, 4> Erased;
3289   for (const auto &KV : Cost->getMinimalBitwidths()) {
3290     // If the value wasn't vectorized, we must maintain the original scalar
3291     // type. The absence of the value from VectorLoopValueMap indicates that it
3292     // wasn't vectorized.
3293     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3294       continue;
3295     for (unsigned Part = 0; Part < UF; ++Part) {
3296       Value *I = getOrCreateVectorValue(KV.first, Part);
3297       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3298           !isa<Instruction>(I))
3299         continue;
3300       Type *OriginalTy = I->getType();
3301       Type *ScalarTruncatedTy =
3302           IntegerType::get(OriginalTy->getContext(), KV.second);
3303       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3304                                           OriginalTy->getVectorNumElements());
3305       if (TruncatedTy == OriginalTy)
3306         continue;
3307 
3308       IRBuilder<> B(cast<Instruction>(I));
3309       auto ShrinkOperand = [&](Value *V) -> Value * {
3310         if (auto *ZI = dyn_cast<ZExtInst>(V))
3311           if (ZI->getSrcTy() == TruncatedTy)
3312             return ZI->getOperand(0);
3313         return B.CreateZExtOrTrunc(V, TruncatedTy);
3314       };
3315 
3316       // The actual instruction modification depends on the instruction type,
3317       // unfortunately.
3318       Value *NewI = nullptr;
3319       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3320         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3321                              ShrinkOperand(BO->getOperand(1)));
3322 
3323         // Any wrapping introduced by shrinking this operation shouldn't be
3324         // considered undefined behavior. So, we can't unconditionally copy
3325         // arithmetic wrapping flags to NewI.
3326         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3327       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3328         NewI =
3329             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3330                          ShrinkOperand(CI->getOperand(1)));
3331       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3332         NewI = B.CreateSelect(SI->getCondition(),
3333                               ShrinkOperand(SI->getTrueValue()),
3334                               ShrinkOperand(SI->getFalseValue()));
3335       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3336         switch (CI->getOpcode()) {
3337         default:
3338           llvm_unreachable("Unhandled cast!");
3339         case Instruction::Trunc:
3340           NewI = ShrinkOperand(CI->getOperand(0));
3341           break;
3342         case Instruction::SExt:
3343           NewI = B.CreateSExtOrTrunc(
3344               CI->getOperand(0),
3345               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3346           break;
3347         case Instruction::ZExt:
3348           NewI = B.CreateZExtOrTrunc(
3349               CI->getOperand(0),
3350               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3351           break;
3352         }
3353       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3354         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3355         auto *O0 = B.CreateZExtOrTrunc(
3356             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3357         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3358         auto *O1 = B.CreateZExtOrTrunc(
3359             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3360 
3361         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3362       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3363         // Don't do anything with the operands, just extend the result.
3364         continue;
3365       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3366         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3367         auto *O0 = B.CreateZExtOrTrunc(
3368             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3369         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3370         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3371       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3372         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3373         auto *O0 = B.CreateZExtOrTrunc(
3374             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3375         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3376       } else {
3377         // If we don't know what to do, be conservative and don't do anything.
3378         continue;
3379       }
3380 
3381       // Lastly, extend the result.
3382       NewI->takeName(cast<Instruction>(I));
3383       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3384       I->replaceAllUsesWith(Res);
3385       cast<Instruction>(I)->eraseFromParent();
3386       Erased.insert(I);
3387       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3388     }
3389   }
3390 
3391   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3392   for (const auto &KV : Cost->getMinimalBitwidths()) {
3393     // If the value wasn't vectorized, we must maintain the original scalar
3394     // type. The absence of the value from VectorLoopValueMap indicates that it
3395     // wasn't vectorized.
3396     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3397       continue;
3398     for (unsigned Part = 0; Part < UF; ++Part) {
3399       Value *I = getOrCreateVectorValue(KV.first, Part);
3400       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3401       if (Inst && Inst->use_empty()) {
3402         Value *NewI = Inst->getOperand(0);
3403         Inst->eraseFromParent();
3404         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3405       }
3406     }
3407   }
3408 }
3409 
3410 void InnerLoopVectorizer::fixVectorizedLoop() {
3411   // Insert truncates and extends for any truncated instructions as hints to
3412   // InstCombine.
3413   if (VF > 1)
3414     truncateToMinimalBitwidths();
3415 
3416   // Fix widened non-induction PHIs by setting up the PHI operands.
3417   if (OrigPHIsToFix.size()) {
3418     assert(EnableVPlanNativePath &&
3419            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3420     fixNonInductionPHIs();
3421   }
3422 
3423   // At this point every instruction in the original loop is widened to a
3424   // vector form. Now we need to fix the recurrences in the loop. These PHI
3425   // nodes are currently empty because we did not want to introduce cycles.
3426   // This is the second stage of vectorizing recurrences.
3427   fixCrossIterationPHIs();
3428 
3429   // Update the dominator tree.
3430   //
3431   // FIXME: After creating the structure of the new loop, the dominator tree is
3432   //        no longer up-to-date, and it remains that way until we update it
3433   //        here. An out-of-date dominator tree is problematic for SCEV,
3434   //        because SCEVExpander uses it to guide code generation. The
  //        vectorizer uses SCEVExpander in several places. Instead, we should
3436   //        keep the dominator tree up-to-date as we go.
3437   updateAnalysis();
3438 
3439   // Fix-up external users of the induction variables.
3440   for (auto &Entry : *Legal->getInductionVars())
3441     fixupIVUsers(Entry.first, Entry.second,
3442                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3443                  IVEndValues[Entry.first], LoopMiddleBlock);
3444 
3445   fixLCSSAPHIs();
3446   for (Instruction *PI : PredicatedInstructions)
3447     sinkScalarOperands(&*PI);
3448 
3449   // Remove redundant induction instructions.
3450   cse(LoopVectorBody);
3451 }
3452 
3453 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3454   // In order to support recurrences we need to be able to vectorize Phi nodes.
3455   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3456   // stage #2: We now need to fix the recurrences by adding incoming edges to
3457   // the currently empty PHI nodes. At this point every instruction in the
3458   // original loop is widened to a vector form so we can use them to construct
3459   // the incoming edges.
3460   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3461     // Handle first-order recurrences and reductions that need to be fixed.
3462     if (Legal->isFirstOrderRecurrence(&Phi))
3463       fixFirstOrderRecurrence(&Phi);
3464     else if (Legal->isReductionVariable(&Phi))
3465       fixReduction(&Phi);
3466   }
3467 }
3468 
3469 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3470   // This is the second phase of vectorizing first-order recurrences. An
3471   // overview of the transformation is described below. Suppose we have the
3472   // following loop.
3473   //
3474   //   for (int i = 0; i < n; ++i)
3475   //     b[i] = a[i] - a[i - 1];
3476   //
3477   // There is a first-order recurrence on "a". For this loop, the shorthand
3478   // scalar IR looks like:
3479   //
3480   //   scalar.ph:
3481   //     s_init = a[-1]
3482   //     br scalar.body
3483   //
3484   //   scalar.body:
3485   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3486   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3487   //     s2 = a[i]
3488   //     b[i] = s2 - s1
3489   //     br cond, scalar.body, ...
3490   //
  // In this example, s1 is a recurrence because its value depends on the
3492   // previous iteration. In the first phase of vectorization, we created a
3493   // temporary value for s1. We now complete the vectorization and produce the
3494   // shorthand vector IR shown below (for VF = 4, UF = 1).
3495   //
3496   //   vector.ph:
3497   //     v_init = vector(..., ..., ..., a[-1])
3498   //     br vector.body
3499   //
3500   //   vector.body
3501   //     i = phi [0, vector.ph], [i+4, vector.body]
3502   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3503   //     v2 = a[i, i+1, i+2, i+3];
3504   //     v3 = vector(v1(3), v2(0, 1, 2))
3505   //     b[i, i+1, i+2, i+3] = v2 - v3
3506   //     br cond, vector.body, middle.block
3507   //
3508   //   middle.block:
3509   //     x = v2(3)
3510   //     br scalar.ph
3511   //
3512   //   scalar.ph:
3513   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3514   //     br scalar.body
3515   //
  // After the vector loop completes execution, we extract the next value of
3517   // the recurrence (x) to use as the initial value in the scalar loop.
3518 
3519   // Get the original loop preheader and single loop latch.
3520   auto *Preheader = OrigLoop->getLoopPreheader();
3521   auto *Latch = OrigLoop->getLoopLatch();
3522 
3523   // Get the initial and previous values of the scalar recurrence.
3524   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3525   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3526 
3527   // Create a vector from the initial value.
3528   auto *VectorInit = ScalarInit;
3529   if (VF > 1) {
3530     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3531     VectorInit = Builder.CreateInsertElement(
3532         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3533         Builder.getInt32(VF - 1), "vector.recur.init");
3534   }
3535 
3536   // We constructed a temporary phi node in the first phase of vectorization.
3537   // This phi node will eventually be deleted.
3538   Builder.SetInsertPoint(
3539       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3540 
3541   // Create a phi node for the new recurrence. The current value will either be
3542   // the initial value inserted into a vector or loop-varying vector value.
3543   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3544   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3545 
3546   // Get the vectorized previous value of the last part UF - 1. It appears last
3547   // among all unrolled iterations, due to the order of their construction.
3548   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3549 
3550   // Set the insertion point after the previous value if it is an instruction.
3551   // Note that the previous value may have been constant-folded so it is not
3552   // guaranteed to be an instruction in the vector loop. Also, if the previous
3553   // value is a phi node, we should insert after all the phi nodes to avoid
3554   // breaking basic block verification.
3555   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3556       isa<PHINode>(PreviousLastPart))
3557     Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3558   else
3559     Builder.SetInsertPoint(
3560         &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3561 
3562   // We will construct a vector for the recurrence by combining the values for
3563   // the current and previous iterations. This is the required shuffle mask.
3564   SmallVector<Constant *, 8> ShuffleMask(VF);
3565   ShuffleMask[0] = Builder.getInt32(VF - 1);
3566   for (unsigned I = 1; I < VF; ++I)
3567     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
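
  // For example, with VF = 4 the mask is <3, 4, 5, 6>: the shuffle below picks
  // the last element of the incoming vector followed by the first three
  // elements of the current part, matching v3 in the sketch above.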
3568 
3569   // The vector from which to take the initial value for the current iteration
3570   // (actual or unrolled). Initially, this is the vector phi node.
3571   Value *Incoming = VecPhi;
3572 
3573   // Shuffle the current and previous vector and update the vector parts.
3574   for (unsigned Part = 0; Part < UF; ++Part) {
3575     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3576     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3577     auto *Shuffle =
3578         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3579                                              ConstantVector::get(ShuffleMask))
3580                : Incoming;
3581     PhiPart->replaceAllUsesWith(Shuffle);
3582     cast<Instruction>(PhiPart)->eraseFromParent();
3583     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3584     Incoming = PreviousPart;
3585   }
3586 
3587   // Fix the latch value of the new recurrence in the vector loop.
3588   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3589 
3590   // Extract the last vector element in the middle block. This will be the
3591   // initial value for the recurrence when jumping to the scalar loop.
3592   auto *ExtractForScalar = Incoming;
3593   if (VF > 1) {
3594     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3595     ExtractForScalar = Builder.CreateExtractElement(
3596         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3597   }
  // Extract the second-to-last element in the middle block if the
3599   // Phi is used outside the loop. We need to extract the phi itself
3600   // and not the last element (the phi update in the current iteration). This
3601   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3602   // when the scalar loop is not run at all.
3603   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3604   if (VF > 1)
3605     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3606         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
3611   else if (UF > 1)
3612     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3613 
3614   // Fix the initial value of the original recurrence in the scalar loop.
3615   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3616   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3617   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3618     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3619     Start->addIncoming(Incoming, BB);
3620   }
3621 
3622   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3623   Phi->setName("scalar.recur");
3624 
3625   // Finally, fix users of the recurrence outside the loop. The users will need
3626   // either the last value of the scalar recurrence or the last value of the
3627   // vector recurrence we extracted in the middle block. Since the loop is in
3628   // LCSSA form, we just need to find all the phi nodes for the original scalar
3629   // recurrence in the exit block, and then add an edge for the middle block.
3630   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3631     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3632       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3633     }
3634   }
3635 }
3636 
3637 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3638   Constant *Zero = Builder.getInt32(0);
3639 
  // Get its reduction variable descriptor.
3641   assert(Legal->isReductionVariable(Phi) &&
3642          "Unable to find the reduction variable");
3643   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3644 
3645   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3646   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3647   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3648   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3649     RdxDesc.getMinMaxRecurrenceKind();
3650   setDebugLocFromInst(Builder, ReductionStartValue);
3651 
3652   // We need to generate a reduction vector from the incoming scalar.
3653   // To do so, we need to generate the 'identity' vector and override
3654   // one of the elements with the incoming scalar reduction. We need
3655   // to do it in the vector-loop preheader.
3656   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3657 
3658   // This is the vector-clone of the value that leaves the loop.
3659   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3660 
  // Find the reduction identity value: zero for addition, 'or' and 'xor';
  // one for multiplication; all ones (-1) for 'and'.
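  // For example, for an integer add reduction with VF = 4 and start value %s,
  // the code below conceptually produces
  //   Identity    = <i32 0, i32 0, i32 0, i32 0>
  //   VectorStart = <i32 %s, i32 0, i32 0, i32 0>
  // so that only the first lane carries the incoming scalar start value.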
3663   Value *Identity;
3664   Value *VectorStart;
3665   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3666       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3668     if (VF == 1) {
3669       VectorStart = Identity = ReductionStartValue;
3670     } else {
3671       VectorStart = Identity =
3672         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3673     }
3674   } else {
3675     // Handle other reduction kinds:
3676     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3677         RK, VecTy->getScalarType());
3678     if (VF == 1) {
3679       Identity = Iden;
3680       // This vector is the Identity vector where the first element is the
3681       // incoming scalar reduction.
3682       VectorStart = ReductionStartValue;
3683     } else {
3684       Identity = ConstantVector::getSplat(VF, Iden);
3685 
3686       // This vector is the Identity vector where the first element is the
3687       // incoming scalar reduction.
3688       VectorStart =
3689         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3690     }
3691   }
3692 
3693   // Fix the vector-loop phi.
3694 
3695   // Reductions do not have to start at zero. They can start with
3696   // any loop invariant values.
3697   BasicBlock *Latch = OrigLoop->getLoopLatch();
3698   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3699   for (unsigned Part = 0; Part < UF; ++Part) {
3700     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3701     Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
3704     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3705     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3706     cast<PHINode>(VecRdxPhi)
3707       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3708   }
3709 
3710   // Before each round, move the insertion point right between
3711   // the PHIs and the values we are going to write.
3712   // This allows us to write both PHINodes and the extractelement
3713   // instructions.
3714   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3715 
3716   setDebugLocFromInst(Builder, LoopExitInst);
3717 
  // If the tail is folded by masking, the vector value that leaves the loop
  // should be a select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, rather than the LoopExitInst itself.
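  // Conceptually, such a select looks like (illustrative only, assuming VF = 4)
  //   %sel = select <4 x i1> %mask, <4 x i32> %vec.exit, <4 x i32> %vec.phi
  // and it is that select, not %vec.exit, that must feed the reduction below.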
3721   if (Cost->foldTailByMasking()) {
3722     for (unsigned Part = 0; Part < UF; ++Part) {
3723       Value *VecLoopExitInst =
3724           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3725       Value *Sel = nullptr;
3726       for (User *U : VecLoopExitInst->users()) {
3727         if (isa<SelectInst>(U)) {
3728           assert(!Sel && "Reduction exit feeding two selects");
3729           Sel = U;
3730         } else
3731           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3732       }
3733       assert(Sel && "Reduction exit feeds no select");
3734       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3735     }
3736   }
3737 
3738   // If the vector reduction can be performed in a smaller type, we truncate
3739   // then extend the loop exit value to enable InstCombine to evaluate the
3740   // entire expression in the smaller type.
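  // For example, if an i32 reduction is known to fit in i8, each unrolled part
  // is rewritten roughly as
  //   %trunc = trunc <4 x i32> %rdx.part to <4 x i8>
  //   %extnd = sext <4 x i8> %trunc to <4 x i32>   ; or zext if unsigned
  // and users of the part are redirected to %extnd, letting InstCombine shrink
  // the whole reduction expression to i8.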
3741   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3742     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3743     Builder.SetInsertPoint(
3744         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3745     VectorParts RdxParts(UF);
3746     for (unsigned Part = 0; Part < UF; ++Part) {
3747       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3748       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3749       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3750                                         : Builder.CreateZExt(Trunc, VecTy);
3751       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3752            UI != RdxParts[Part]->user_end();)
3753         if (*UI != Trunc) {
3754           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3755           RdxParts[Part] = Extnd;
3756         } else {
3757           ++UI;
3758         }
3759     }
3760     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3761     for (unsigned Part = 0; Part < UF; ++Part) {
3762       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3763       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3764     }
3765   }
3766 
3767   // Reduce all of the unrolled parts into a single vector.
3768   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3769   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
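  // As a rough example, with UF = 2, VF = 4 and an integer add reduction, the
  // loop below emits a single
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  // before the cross-lane reduction.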
3770 
3771   // The middle block terminator has already been assigned a DebugLoc here (the
3772   // OrigLoop's single latch terminator). We want the whole middle block to
3773   // appear to execute on this line because: (a) it is all compiler generated,
3774   // (b) these instructions are always executed after evaluating the latch
3775   // conditional branch, and (c) other passes may add new predecessors which
3776   // terminate on this line. This is the easiest way to ensure we don't
3777   // accidentally cause an extra step back into the loop while debugging.
3778   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3779   for (unsigned Part = 1; Part < UF; ++Part) {
3780     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3781     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3782       // Floating point operations had to be 'fast' to enable the reduction.
3783       ReducedPartRdx = addFastMathFlag(
3784           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3785                               ReducedPartRdx, "bin.rdx"),
3786           RdxDesc.getFastMathFlags());
3787     else
3788       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3789                                       RdxPart);
3790   }
3791 
3792   if (VF > 1) {
3793     bool NoNaN = Legal->hasFunNoNaNAttr();
3794     ReducedPartRdx =
3795         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3796     // If the reduction can be performed in a smaller type, we need to extend
3797     // the reduction to the wider type before we branch to the original loop.
3798     if (Phi->getType() != RdxDesc.getRecurrenceType())
3799       ReducedPartRdx =
3800         RdxDesc.isSigned()
3801         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3802         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3803   }
3804 
3805   // Create a phi node that merges control-flow from the backedge-taken check
3806   // block and the middle block.
3807   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3808                                         LoopScalarPreHeader->getTerminator());
3809   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3810     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3811   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3812 
3813   // Now, we need to fix the users of the reduction variable
3814   // inside and outside of the scalar remainder loop.
3815   // We know that the loop is in LCSSA form. We need to update the
3816   // PHI nodes in the exit blocks.
3817   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3818     // All PHINodes need to have a single entry edge, or two if
3819     // we already fixed them.
3820     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3821 
3822     // We found a reduction value exit-PHI. Update it with the
3823     // incoming bypass edge.
3824     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3825       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3826   } // end of the LCSSA phi scan.
3827 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3830   int IncomingEdgeBlockIdx =
3831     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3832   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3833   // Pick the other block.
3834   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3835   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3836   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3837 }
3838 
3839 void InnerLoopVectorizer::fixLCSSAPHIs() {
3840   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3841     if (LCSSAPhi.getNumIncomingValues() == 1) {
3842       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values have a single value; use lane zero.
3844       unsigned LastLane = 0;
3845       if (isa<Instruction>(IncomingValue))
3846           LastLane = Cost->isUniformAfterVectorization(
3847                          cast<Instruction>(IncomingValue), VF)
3848                          ? 0
3849                          : VF - 1;
3850       // Can be a loop invariant incoming value or the last scalar value to be
3851       // extracted from the vectorized loop.
3852       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3853       Value *lastIncomingValue =
3854           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3855       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3856     }
3857   }
3858 }
3859 
3860 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3861   // The basic block and loop containing the predicated instruction.
3862   auto *PredBB = PredInst->getParent();
3863   auto *VectorLoop = LI->getLoopFor(PredBB);
3864 
3865   // Initialize a worklist with the operands of the predicated instruction.
3866   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3867 
3868   // Holds instructions that we need to analyze again. An instruction may be
3869   // reanalyzed if we don't yet know if we can sink it or not.
3870   SmallVector<Instruction *, 8> InstsToReanalyze;
3871 
3872   // Returns true if a given use occurs in the predicated block. Phi nodes use
3873   // their operands in their corresponding predecessor blocks.
3874   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3875     auto *I = cast<Instruction>(U.getUser());
3876     BasicBlock *BB = I->getParent();
3877     if (auto *Phi = dyn_cast<PHINode>(I))
3878       BB = Phi->getIncomingBlock(
3879           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3880     return BB == PredBB;
3881   };
3882 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when one pass
  // through the worklist fails to sink a single instruction.
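  // Illustrative example: if the address computation of a predicated
  // scalarized store is only used inside the predicated block, e.g.
  //   %gep = getelementptr i32, i32* %base, i64 %idx
  //   ...
  // pred.store.if:
  //   store i32 %val, i32* %gep
  // then %gep is sunk into pred.store.if so it only executes under the mask.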
3887   bool Changed;
3888   do {
3889     // Add the instructions that need to be reanalyzed to the worklist, and
3890     // reset the changed indicator.
3891     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3892     InstsToReanalyze.clear();
3893     Changed = false;
3894 
3895     while (!Worklist.empty()) {
3896       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3897 
3898       // We can't sink an instruction if it is a phi node, is already in the
3899       // predicated block, is not in the loop, or may have side effects.
3900       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3901           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3902         continue;
3903 
3904       // It's legal to sink the instruction if all its uses occur in the
3905       // predicated block. Otherwise, there's nothing to do yet, and we may
3906       // need to reanalyze the instruction.
3907       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3908         InstsToReanalyze.push_back(I);
3909         continue;
3910       }
3911 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
3914       I->moveBefore(&*PredBB->getFirstInsertionPt());
3915       Worklist.insert(I->op_begin(), I->op_end());
3916 
3917       // The sinking may have enabled other instructions to be sunk, so we will
3918       // need to iterate.
3919       Changed = true;
3920     }
3921   } while (Changed);
3922 }
3923 
3924 void InnerLoopVectorizer::fixNonInductionPHIs() {
3925   for (PHINode *OrigPhi : OrigPHIsToFix) {
3926     PHINode *NewPhi =
3927         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3928     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3929 
3930     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3931         predecessors(OrigPhi->getParent()));
3932     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3933         predecessors(NewPhi->getParent()));
3934     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3935            "Scalar and Vector BB should have the same number of predecessors");
3936 
3937     // The insertion point in Builder may be invalidated by the time we get
3938     // here. Force the Builder insertion point to something valid so that we do
3939     // not run into issues during insertion point restore in
3940     // getOrCreateVectorValue calls below.
3941     Builder.SetInsertPoint(NewPhi);
3942 
3943     // The predecessor order is preserved and we can rely on mapping between
3944     // scalar and vector block predecessors.
3945     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3946       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3947 
3948       // When looking up the new scalar/vector values to fix up, use incoming
3949       // values from original phi.
3950       Value *ScIncV =
3951           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3952 
      // The scalar incoming value may need a broadcast.
3954       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3955       NewPhi->addIncoming(NewIncV, NewPredBB);
3956     }
3957   }
3958 }
3959 
3960 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3961                                               unsigned VF) {
3962   PHINode *P = cast<PHINode>(PN);
3963   if (EnableVPlanNativePath) {
3964     // Currently we enter here in the VPlan-native path for non-induction
3965     // PHIs where all control flow is uniform. We simply widen these PHIs.
3966     // Create a vector phi with no operands - the vector phi operands will be
3967     // set at the end of vector code generation.
3968     Type *VecTy =
3969         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3970     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3971     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3972     OrigPHIsToFix.push_back(P);
3973 
3974     return;
3975   }
3976 
3977   assert(PN->getParent() == OrigLoop->getHeader() &&
3978          "Non-header phis should have been handled elsewhere");
3979 
3980   // In order to support recurrences we need to be able to vectorize Phi nodes.
3981   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3982   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3983   // this value when we vectorize all of the instructions that use the PHI.
3984   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3985     for (unsigned Part = 0; Part < UF; ++Part) {
3986       // This is phase one of vectorizing PHIs.
3987       Type *VecTy =
3988           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3989       Value *EntryPart = PHINode::Create(
3990           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3991       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3992     }
3993     return;
3994   }
3995 
3996   setDebugLocFromInst(Builder, P);
3997 
3998   // This PHINode must be an induction variable.
3999   // Make sure that we know about it.
4000   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4001 
4002   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4003   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4004 
4005   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4006   // which can be found from the original scalar operations.
4007   switch (II.getKind()) {
4008   case InductionDescriptor::IK_NoInduction:
4009     llvm_unreachable("Unknown induction");
4010   case InductionDescriptor::IK_IntInduction:
4011   case InductionDescriptor::IK_FpInduction:
4012     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4013   case InductionDescriptor::IK_PtrInduction: {
4014     // Handle the pointer induction variable case.
4015     assert(P->getType()->isPointerTy() && "Unexpected type.");
4016     // This is the normalized GEP that starts counting at zero.
4017     Value *PtrInd = Induction;
4018     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4019     // Determine the number of scalars we need to generate for each unroll
4020     // iteration. If the instruction is uniform, we only need to generate the
4021     // first lane. Otherwise, we generate all VF values.
4022     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4023     // These are the scalar results. Notice that we don't generate vector GEPs
4024     // because scalar GEPs result in better code.
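    // For example, with UF = 2, VF = 4 and a non-uniform pointer induction,
    // this emits eight scalar "next.gep" values, conceptually the transformed
    // indices PtrInd + 0 ... PtrInd + 7 applied via the induction descriptor.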
4025     for (unsigned Part = 0; Part < UF; ++Part) {
4026       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4027         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4028         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4029         Value *SclrGep =
4030             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4031         SclrGep->setName("next.gep");
4032         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4033       }
4034     }
4035     return;
4036   }
4037   }
4038 }
4039 
4040 /// A helper function for checking whether an integer division-related
4041 /// instruction may divide by zero (in which case it must be predicated if
4042 /// executed conditionally in the scalar code).
4043 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
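/// For example, "udiv i32 %x, %n" with a loop-varying %n may divide by zero
/// and needs predication when executed conditionally, whereas
/// "udiv i32 %x, 7" never can.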
4047 static bool mayDivideByZero(Instruction &I) {
4048   assert((I.getOpcode() == Instruction::UDiv ||
4049           I.getOpcode() == Instruction::SDiv ||
4050           I.getOpcode() == Instruction::URem ||
4051           I.getOpcode() == Instruction::SRem) &&
4052          "Unexpected instruction");
4053   Value *Divisor = I.getOperand(1);
4054   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4055   return !CInt || CInt->isZero();
4056 }
4057 
4058 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4059   switch (I.getOpcode()) {
4060   case Instruction::Br:
4061   case Instruction::PHI:
4062     llvm_unreachable("This instruction is handled by a different recipe.");
4063   case Instruction::GetElementPtr: {
4064     // Construct a vector GEP by widening the operands of the scalar GEP as
4065     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4066     // results in a vector of pointers when at least one operand of the GEP
4067     // is vector-typed. Thus, to keep the representation compact, we only use
4068     // vector-typed operands for loop-varying values.
4069     auto *GEP = cast<GetElementPtrInst>(&I);
4070 
4071     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4072       // If we are vectorizing, but the GEP has only loop-invariant operands,
4073       // the GEP we build (by only using vector-typed operands for
4074       // loop-varying values) would be a scalar pointer. Thus, to ensure we
4075       // produce a vector of pointers, we need to either arbitrarily pick an
4076       // operand to broadcast, or broadcast a clone of the original GEP.
4077       // Here, we broadcast a clone of the original.
4078       //
4079       // TODO: If at some point we decide to scalarize instructions having
4080       //       loop-invariant operands, this special case will no longer be
4081       //       required. We would add the scalarization decision to
4082       //       collectLoopScalars() and teach getVectorValue() to broadcast
4083       //       the lane-zero scalar value.
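      // For example, a loop-invariant "getelementptr i32, i32* %p, i64 42" is
      // cloned and then splatted, producing roughly a <VF x i32*> value with
      // the cloned pointer in every lane.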
4084       auto *Clone = Builder.Insert(GEP->clone());
4085       for (unsigned Part = 0; Part < UF; ++Part) {
4086         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4087         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4088         addMetadata(EntryPart, GEP);
4089       }
4090     } else {
4091       // If the GEP has at least one loop-varying operand, we are sure to
4092       // produce a vector of pointers. But if we are only unrolling, we want
4093       // to produce a scalar GEP for each unroll part. Thus, the GEP we
4094       // produce with the code below will be scalar (if VF == 1) or vector
4095       // (otherwise). Note that for the unroll-only case, we still maintain
4096       // values in the vector mapping with initVector, as we do for other
4097       // instructions.
4098       for (unsigned Part = 0; Part < UF; ++Part) {
4099         // The pointer operand of the new GEP. If it's loop-invariant, we
4100         // won't broadcast it.
4101         auto *Ptr =
4102             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4103                 ? GEP->getPointerOperand()
4104                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4105 
4106         // Collect all the indices for the new GEP. If any index is
4107         // loop-invariant, we won't broadcast it.
4108         SmallVector<Value *, 4> Indices;
4109         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4110           if (OrigLoop->isLoopInvariant(U.get()))
4111             Indices.push_back(U.get());
4112           else
4113             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4114         }
4115 
        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
        // but it should be a vector otherwise.
4118         auto *NewGEP =
4119             GEP->isInBounds()
4120                 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4121                                             Indices)
4122                 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4123         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4124                "NewGEP is not a pointer vector");
4125         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4126         addMetadata(NewGEP, GEP);
4127       }
4128     }
4129 
4130     break;
4131   }
4132   case Instruction::UDiv:
4133   case Instruction::SDiv:
4134   case Instruction::SRem:
4135   case Instruction::URem:
4136   case Instruction::Add:
4137   case Instruction::FAdd:
4138   case Instruction::Sub:
4139   case Instruction::FSub:
4140   case Instruction::FNeg:
4141   case Instruction::Mul:
4142   case Instruction::FMul:
4143   case Instruction::FDiv:
4144   case Instruction::FRem:
4145   case Instruction::Shl:
4146   case Instruction::LShr:
4147   case Instruction::AShr:
4148   case Instruction::And:
4149   case Instruction::Or:
4150   case Instruction::Xor: {
4151     // Just widen unops and binops.
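    // For example, with VF = 4 a scalar "add i32 %a, %b" becomes, per unroll
    // part, roughly "add <4 x i32> %va, %vb", with the IR flags of the
    // original instruction copied over.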
4152     setDebugLocFromInst(Builder, &I);
4153 
4154     for (unsigned Part = 0; Part < UF; ++Part) {
4155       SmallVector<Value *, 2> Ops;
4156       for (Value *Op : I.operands())
4157         Ops.push_back(getOrCreateVectorValue(Op, Part));
4158 
4159       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4160 
4161       if (auto *VecOp = dyn_cast<Instruction>(V))
4162         VecOp->copyIRFlags(&I);
4163 
4164       // Use this vector value for all users of the original instruction.
4165       VectorLoopValueMap.setVectorValue(&I, Part, V);
4166       addMetadata(V, &I);
4167     }
4168 
4169     break;
4170   }
4171   case Instruction::Select: {
4172     // Widen selects.
4173     // If the selector is loop invariant we can create a select
4174     // instruction with a scalar condition. Otherwise, use vector-select.
4175     auto *SE = PSE.getSE();
4176     bool InvariantCond =
4177         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4178     setDebugLocFromInst(Builder, &I);
4179 
    // The condition can be loop invariant but still defined inside the
4181     // loop. This means that we can't just use the original 'cond' value.
4182     // We have to take the 'vectorized' value and pick the first lane.
4183     // Instcombine will make this a no-op.
4184 
4185     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4186 
4187     for (unsigned Part = 0; Part < UF; ++Part) {
4188       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4189       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4190       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4191       Value *Sel =
4192           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4193       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4194       addMetadata(Sel, &I);
4195     }
4196 
4197     break;
4198   }
4199 
4200   case Instruction::ICmp:
4201   case Instruction::FCmp: {
4202     // Widen compares. Generate vector compares.
4203     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4204     auto *Cmp = cast<CmpInst>(&I);
4205     setDebugLocFromInst(Builder, Cmp);
4206     for (unsigned Part = 0; Part < UF; ++Part) {
4207       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4208       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4209       Value *C = nullptr;
4210       if (FCmp) {
4211         // Propagate fast math flags.
4212         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4213         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4214         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4215       } else {
4216         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4217       }
4218       VectorLoopValueMap.setVectorValue(&I, Part, C);
4219       addMetadata(C, &I);
4220     }
4221 
4222     break;
4223   }
4224 
4225   case Instruction::ZExt:
4226   case Instruction::SExt:
4227   case Instruction::FPToUI:
4228   case Instruction::FPToSI:
4229   case Instruction::FPExt:
4230   case Instruction::PtrToInt:
4231   case Instruction::IntToPtr:
4232   case Instruction::SIToFP:
4233   case Instruction::UIToFP:
4234   case Instruction::Trunc:
4235   case Instruction::FPTrunc:
4236   case Instruction::BitCast: {
4237     auto *CI = cast<CastInst>(&I);
4238     setDebugLocFromInst(Builder, CI);
4239 
    // Vectorize casts.
4241     Type *DestTy =
4242         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4243 
4244     for (unsigned Part = 0; Part < UF; ++Part) {
4245       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4246       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4247       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4248       addMetadata(Cast, &I);
4249     }
4250     break;
4251   }
4252 
4253   case Instruction::Call: {
4254     // Ignore dbg intrinsics.
4255     if (isa<DbgInfoIntrinsic>(I))
4256       break;
4257     setDebugLocFromInst(Builder, &I);
4258 
4259     Module *M = I.getParent()->getParent()->getParent();
4260     auto *CI = cast<CallInst>(&I);
4261 
4262     StringRef FnName = CI->getCalledFunction()->getName();
4263     Function *F = CI->getCalledFunction();
4264     Type *RetTy = ToVectorTy(CI->getType(), VF);
4265     SmallVector<Type *, 4> Tys;
4266     for (Value *ArgOperand : CI->arg_operands())
4267       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4268 
4269     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4270 
    // The flag shows whether we use an intrinsic or a regular call for the
    // vectorized version of the instruction.
    // Is it more beneficial to use an intrinsic call than a library call?
4274     bool NeedToScalarize;
4275     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4276     bool UseVectorIntrinsic =
4277         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4278     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4279            "Instruction should be scalarized elsewhere.");
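    // For example (assuming the cost model prefers the intrinsic), a scalar
    // call to @llvm.sqrt.f32 may be widened to @llvm.sqrt.v4f32 for VF = 4;
    // otherwise the vectorized library function returned by
    // TLI->getVectorizedFunction is called instead.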
4280 
4281     for (unsigned Part = 0; Part < UF; ++Part) {
4282       SmallVector<Value *, 4> Args;
4283       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4284         Value *Arg = CI->getArgOperand(i);
4285         // Some intrinsics have a scalar argument - don't replace it with a
4286         // vector.
4287         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4288           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4289         Args.push_back(Arg);
4290       }
4291 
4292       Function *VectorF;
4293       if (UseVectorIntrinsic) {
4294         // Use vector version of the intrinsic.
4295         Type *TysForDecl[] = {CI->getType()};
4296         if (VF > 1)
4297           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4298         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4299       } else {
4300         // Use vector version of the library call.
4301         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4302         assert(!VFnName.empty() && "Vector function name is empty.");
4303         VectorF = M->getFunction(VFnName);
4304         if (!VectorF) {
4305           // Generate a declaration
4306           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4307           VectorF =
4308               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4309           VectorF->copyAttributesFrom(F);
4310         }
4311       }
4312       assert(VectorF && "Can't create vector function.");
4313 
4314       SmallVector<OperandBundleDef, 1> OpBundles;
4315       CI->getOperandBundlesAsDefs(OpBundles);
4316       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4317 
4318       if (isa<FPMathOperator>(V))
4319         V->copyFastMathFlags(CI);
4320 
4321       VectorLoopValueMap.setVectorValue(&I, Part, V);
4322       addMetadata(V, &I);
4323     }
4324 
4325     break;
4326   }
4327 
4328   default:
4329     // This instruction is not vectorized by simple widening.
4330     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4331     llvm_unreachable("Unhandled instruction!");
4332   } // end of switch.
4333 }
4334 
4335 void InnerLoopVectorizer::updateAnalysis() {
4336   // Forget the original basic block.
4337   PSE.getSE()->forgetLoop(OrigLoop);
4338 
4339   // DT is not kept up-to-date for outer loop vectorization
4340   if (EnableVPlanNativePath)
4341     return;
4342 
4343   // Update the dominator tree information.
4344   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4345          "Entry does not dominate exit.");
4346 
4347   DT->addNewBlock(LoopMiddleBlock,
4348                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4349   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4350   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4351   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4352   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4353 }
4354 
4355 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4356   // We should not collect Scalars more than once per VF. Right now, this
4357   // function is called from collectUniformsAndScalars(), which already does
4358   // this check. Collecting Scalars for VF=1 does not make any sense.
4359   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4360          "This function should not be visited twice for the same VF");
4361 
4362   SmallSetVector<Instruction *, 8> Worklist;
4363 
4364   // These sets are used to seed the analysis with pointers used by memory
4365   // accesses that will remain scalar.
4366   SmallSetVector<Instruction *, 8> ScalarPtrs;
4367   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4368 
4369   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4370   // The pointer operands of loads and stores will be scalar as long as the
4371   // memory access is not a gather or scatter operation. The value operand of a
4372   // store will remain scalar if the store is scalarized.
4373   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4374     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4375     assert(WideningDecision != CM_Unknown &&
4376            "Widening decision should be ready at this moment");
4377     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4378       if (Ptr == Store->getValueOperand())
4379         return WideningDecision == CM_Scalarize;
4380     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4381            "Ptr is neither a value or pointer operand");
4382     return WideningDecision != CM_GatherScatter;
4383   };
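  // Roughly speaking, for "store i32 %v, i32* %gep" the use of %gep is a
  // scalar use unless the store becomes a scatter, while the use of %v is a
  // scalar use only if the store itself is scalarized.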
4384 
4385   // A helper that returns true if the given value is a bitcast or
4386   // getelementptr instruction contained in the loop.
4387   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4388     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4389             isa<GetElementPtrInst>(V)) &&
4390            !TheLoop->isLoopInvariant(V);
4391   };
4392 
4393   // A helper that evaluates a memory access's use of a pointer. If the use
4394   // will be a scalar use, and the pointer is only used by memory accesses, we
4395   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4396   // PossibleNonScalarPtrs.
4397   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4398     // We only care about bitcast and getelementptr instructions contained in
4399     // the loop.
4400     if (!isLoopVaryingBitCastOrGEP(Ptr))
4401       return;
4402 
4403     // If the pointer has already been identified as scalar (e.g., if it was
4404     // also identified as uniform), there's nothing to do.
4405     auto *I = cast<Instruction>(Ptr);
4406     if (Worklist.count(I))
4407       return;
4408 
4409     // If the use of the pointer will be a scalar use, and all users of the
4410     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4411     // place the pointer in PossibleNonScalarPtrs.
4412     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4413           return isa<LoadInst>(U) || isa<StoreInst>(U);
4414         }))
4415       ScalarPtrs.insert(I);
4416     else
4417       PossibleNonScalarPtrs.insert(I);
4418   };
4419 
4420   // We seed the scalars analysis with three classes of instructions: (1)
4421   // instructions marked uniform-after-vectorization, (2) bitcast and
4422   // getelementptr instructions used by memory accesses requiring a scalar use,
4423   // and (3) pointer induction variables and their update instructions (we
4424   // currently only scalarize these).
4425   //
4426   // (1) Add to the worklist all instructions that have been identified as
4427   // uniform-after-vectorization.
4428   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4429 
4430   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4431   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4433   // scatter operation. The value operand of a store will remain scalar if the
4434   // store is scalarized.
4435   for (auto *BB : TheLoop->blocks())
4436     for (auto &I : *BB) {
4437       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4438         evaluatePtrUse(Load, Load->getPointerOperand());
4439       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4440         evaluatePtrUse(Store, Store->getPointerOperand());
4441         evaluatePtrUse(Store, Store->getValueOperand());
4442       }
4443     }
4444   for (auto *I : ScalarPtrs)
4445     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4446       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4447       Worklist.insert(I);
4448     }
4449 
4450   // (3) Add to the worklist all pointer induction variables and their update
4451   // instructions.
4452   //
4453   // TODO: Once we are able to vectorize pointer induction variables we should
4454   //       no longer insert them into the worklist here.
4455   auto *Latch = TheLoop->getLoopLatch();
4456   for (auto &Induction : *Legal->getInductionVars()) {
4457     auto *Ind = Induction.first;
4458     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4459     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4460       continue;
4461     Worklist.insert(Ind);
4462     Worklist.insert(IndUpdate);
4463     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4464     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4465                       << "\n");
4466   }
4467 
4468   // Insert the forced scalars.
4469   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4470   // induction variable when the PHI user is scalarized.
4471   auto ForcedScalar = ForcedScalars.find(VF);
4472   if (ForcedScalar != ForcedScalars.end())
4473     for (auto *I : ForcedScalar->second)
4474       Worklist.insert(I);
4475 
4476   // Expand the worklist by looking through any bitcasts and getelementptr
4477   // instructions we've already identified as scalar. This is similar to the
4478   // expansion step in collectLoopUniforms(); however, here we're only
4479   // expanding to include additional bitcasts and getelementptr instructions.
4480   unsigned Idx = 0;
4481   while (Idx != Worklist.size()) {
4482     Instruction *Dst = Worklist[Idx++];
4483     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4484       continue;
4485     auto *Src = cast<Instruction>(Dst->getOperand(0));
4486     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4487           auto *J = cast<Instruction>(U);
4488           return !TheLoop->contains(J) || Worklist.count(J) ||
4489                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4490                   isScalarUse(J, Src));
4491         })) {
4492       Worklist.insert(Src);
4493       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4494     }
4495   }
4496 
4497   // An induction variable will remain scalar if all users of the induction
4498   // variable and induction variable update remain scalar.
4499   for (auto &Induction : *Legal->getInductionVars()) {
4500     auto *Ind = Induction.first;
4501     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4502 
4503     // We already considered pointer induction variables, so there's no reason
4504     // to look at their users again.
4505     //
4506     // TODO: Once we are able to vectorize pointer induction variables we
4507     //       should no longer skip over them here.
4508     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4509       continue;
4510 
4511     // Determine if all users of the induction variable are scalar after
4512     // vectorization.
4513     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4514       auto *I = cast<Instruction>(U);
4515       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4516     });
4517     if (!ScalarInd)
4518       continue;
4519 
4520     // Determine if all users of the induction variable update instruction are
4521     // scalar after vectorization.
4522     auto ScalarIndUpdate =
4523         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4524           auto *I = cast<Instruction>(U);
4525           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4526         });
4527     if (!ScalarIndUpdate)
4528       continue;
4529 
4530     // The induction variable and its update instruction will remain scalar.
4531     Worklist.insert(Ind);
4532     Worklist.insert(IndUpdate);
4533     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4534     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4535                       << "\n");
4536   }
4537 
4538   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4539 }
4540 
4541 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4542   if (!blockNeedsPredication(I->getParent()))
4543     return false;
4544   switch(I->getOpcode()) {
4545   default:
4546     break;
4547   case Instruction::Load:
4548   case Instruction::Store: {
4549     if (!Legal->isMaskRequired(I))
4550       return false;
4551     auto *Ptr = getLoadStorePointerOperand(I);
4552     auto *Ty = getMemInstValueType(I);
4553     // We have already decided how to vectorize this instruction, get that
4554     // result.
4555     if (VF > 1) {
4556       InstWidening WideningDecision = getWideningDecision(I, VF);
4557       assert(WideningDecision != CM_Unknown &&
4558              "Widening decision should be ready at this moment");
4559       return WideningDecision == CM_Scalarize;
4560     }
4561     return isa<LoadInst>(I) ?
        !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4563       : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4564   }
4565   case Instruction::UDiv:
4566   case Instruction::SDiv:
4567   case Instruction::SRem:
4568   case Instruction::URem:
4569     return mayDivideByZero(*I);
4570   }
4571   return false;
4572 }
4573 
4574 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4575                                                                unsigned VF) {
4576   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4577   assert(getWideningDecision(I, VF) == CM_Unknown &&
4578          "Decision should not be set yet.");
4579   auto *Group = getInterleavedAccessGroup(I);
4580   assert(Group && "Must have a group.");
4581 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
4584   auto &DL = I->getModule()->getDataLayout();
4585   auto *ScalarTy = getMemInstValueType(I);
4586   if (hasIrregularType(ScalarTy, DL, VF))
4587     return false;
4588 
4589   // Check if masking is required.
4590   // A Group may need masking for one of two reasons: it resides in a block that
4591   // needs predication, or it was decided to use masking to deal with gaps.
4592   bool PredicatedAccessRequiresMasking =
4593       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4594   bool AccessWithGapsRequiresMasking =
4595       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4596   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4597     return true;
4598 
4599   // If masked interleaving is required, we expect that the user/target had
4600   // enabled it, because otherwise it either wouldn't have been created or
4601   // it should have been invalidated by the CostModel.
4602   assert(useMaskedInterleavedAccesses(TTI) &&
4603          "Masked interleave-groups for predicated accesses are not enabled.");
4604 
4605   auto *Ty = getMemInstValueType(I);
4606   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4607                           : TTI.isLegalMaskedStore(Ty);
4608 }
4609 
4610 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4611                                                                unsigned VF) {
4612   // Get and ensure we have a valid memory instruction.
4613   LoadInst *LI = dyn_cast<LoadInst>(I);
4614   StoreInst *SI = dyn_cast<StoreInst>(I);
4615   assert((LI || SI) && "Invalid memory instruction");
4616 
4617   auto *Ptr = getLoadStorePointerOperand(I);
4618 
4619   // In order to be widened, the pointer should be consecutive, first of all.
4620   if (!Legal->isConsecutivePtr(Ptr))
4621     return false;
4622 
4623   // If the instruction is a store located in a predicated block, it will be
4624   // scalarized.
4625   if (isScalarWithPredication(I))
4626     return false;
4627 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
4630   auto &DL = I->getModule()->getDataLayout();
4631   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4632   if (hasIrregularType(ScalarTy, DL, VF))
4633     return false;
4634 
4635   return true;
4636 }
4637 
4638 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4639   // We should not collect Uniforms more than once per VF. Right now,
4640   // this function is called from collectUniformsAndScalars(), which
4641   // already does this check. Collecting Uniforms for VF=1 does not make any
4642   // sense.
4643 
4644   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4645          "This function should not be visited twice for the same VF");
4646 
  // Visit the list of Uniforms. If we find no uniform value, we will not
  // analyze it again: Uniforms.count(VF) will return 1.
4649   Uniforms[VF].clear();
4650 
4651   // We now know that the loop is vectorizable!
4652   // Collect instructions inside the loop that will remain uniform after
4653   // vectorization.
4654 
4655   // Global values, params and instructions outside of current loop are out of
4656   // scope.
4657   auto isOutOfScope = [&](Value *V) -> bool {
4658     Instruction *I = dyn_cast<Instruction>(V);
4659     return (!I || !TheLoop->contains(I));
4660   };
4661 
4662   SetVector<Instruction *> Worklist;
4663   BasicBlock *Latch = TheLoop->getLoopLatch();
4664 
4665   // Start with the conditional branch. If the branch condition is an
4666   // instruction contained in the loop that is only used by the branch, it is
4667   // uniform.
4668   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4669   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4670     Worklist.insert(Cmp);
4671     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4672   }
4673 
4674   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4675   // are pointers that are treated like consecutive pointers during
4676   // vectorization. The pointer operands of interleaved accesses are an
4677   // example.
4678   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4679 
4680   // Holds pointer operands of instructions that are possibly non-uniform.
4681   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4682 
4683   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4684     InstWidening WideningDecision = getWideningDecision(I, VF);
4685     assert(WideningDecision != CM_Unknown &&
4686            "Widening decision should be ready at this moment");
4687 
4688     return (WideningDecision == CM_Widen ||
4689             WideningDecision == CM_Widen_Reverse ||
4690             WideningDecision == CM_Interleave);
4691   };
4692   // Iterate over the instructions in the loop, and collect all
4693   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4694   // that a consecutive-like pointer operand will be scalarized, we collect it
4695   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4696   // getelementptr instruction can be used by both vectorized and scalarized
4697   // memory instructions. For example, if a loop loads and stores from the same
4698   // location, but the store is conditional, the store will be scalarized, and
4699   // the getelementptr won't remain uniform.
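  // Illustrative IR for that conditional-store case:
  //   %gep = getelementptr i32, i32* %a, i64 %i
  //   %x   = load i32, i32* %gep        ; widened: %gep looks consecutive-like
  //   ...
  //   store i32 %y, i32* %gep           ; scalarized: %gep cannot stay uniform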
4700   for (auto *BB : TheLoop->blocks())
4701     for (auto &I : *BB) {
4702       // If there's no pointer operand, there's nothing to do.
4703       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4704       if (!Ptr)
4705         continue;
4706 
4707       // True if all users of Ptr are memory accesses that have Ptr as their
4708       // pointer operand.
4709       auto UsersAreMemAccesses =
4710           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4711             return getLoadStorePointerOperand(U) == Ptr;
4712           });
4713 
4714       // Ensure the memory instruction will not be scalarized or used by
4715       // gather/scatter, making its pointer operand non-uniform. If the pointer
4716       // operand is used by any instruction other than a memory access, we
4717       // conservatively assume the pointer operand may be non-uniform.
4718       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4719         PossibleNonUniformPtrs.insert(Ptr);
4720 
4721       // If the memory instruction will be vectorized and its pointer operand
4722       // is consecutive-like, or interleaving - the pointer operand should
4723       // remain uniform.
4724       else
4725         ConsecutiveLikePtrs.insert(Ptr);
4726     }
4727 
4728   // Add to the Worklist all consecutive and consecutive-like pointers that
4729   // aren't also identified as possibly non-uniform.
4730   for (auto *V : ConsecutiveLikePtrs)
4731     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4732       LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4733       Worklist.insert(V);
4734     }
4735 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction is only used by uniform instructions.
4739   unsigned idx = 0;
4740   while (idx != Worklist.size()) {
4741     Instruction *I = Worklist[idx++];
4742 
4743     for (auto OV : I->operand_values()) {
4744       // isOutOfScope operands cannot be uniform instructions.
4745       if (isOutOfScope(OV))
4746         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4749       auto *OP = dyn_cast<PHINode>(OV);
4750       if (OP && Legal->isFirstOrderRecurrence(OP))
4751         continue;
4752       // If all the users of the operand are uniform, then add the
4753       // operand into the uniform worklist.
4754       auto *OI = cast<Instruction>(OV);
4755       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4756             auto *J = cast<Instruction>(U);
4757             return Worklist.count(J) ||
4758                    (OI == getLoadStorePointerOperand(J) &&
4759                     isUniformDecision(J, VF));
4760           })) {
4761         Worklist.insert(OI);
4762         LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4763       }
4764     }
4765   }
4766 
4767   // Returns true if Ptr is the pointer operand of a memory access instruction
4768   // I, and I is known to not require scalarization.
4769   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4770     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4771   };
4772 
4773   // For an instruction to be added into Worklist above, all its users inside
4774   // the loop should also be in Worklist. However, this condition cannot be
4775   // true for phi nodes that form a cyclic dependence. We must process phi
4776   // nodes separately. An induction variable will remain uniform if all users
4777   // of the induction variable and induction variable update remain uniform.
4778   // The code below handles both pointer and non-pointer induction variables.
4779   for (auto &Induction : *Legal->getInductionVars()) {
4780     auto *Ind = Induction.first;
4781     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4782 
4783     // Determine if all users of the induction variable are uniform after
4784     // vectorization.
4785     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4786       auto *I = cast<Instruction>(U);
4787       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4788              isVectorizedMemAccessUse(I, Ind);
4789     });
4790     if (!UniformInd)
4791       continue;
4792 
4793     // Determine if all users of the induction variable update instruction are
4794     // uniform after vectorization.
4795     auto UniformIndUpdate =
4796         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4797           auto *I = cast<Instruction>(U);
4798           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4799                  isVectorizedMemAccessUse(I, IndUpdate);
4800         });
4801     if (!UniformIndUpdate)
4802       continue;
4803 
4804     // The induction variable and its update instruction will remain uniform.
4805     Worklist.insert(Ind);
4806     Worklist.insert(IndUpdate);
4807     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4808     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4809                       << "\n");
4810   }
4811 
4812   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4813 }
4814 
4815 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4816   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4817 
4818   if (Legal->getRuntimePointerChecking()->Need) {
4819     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4820         "runtime pointer checks needed. Enable vectorization of this "
4821         "loop with '#pragma clang loop vectorize(enable)' when "
4822         "compiling with -Os/-Oz",
4823         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4824     return true;
4825   }
4826 
4827   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4828     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4829         "runtime SCEV checks needed. Enable vectorization of this "
4830         "loop with '#pragma clang loop vectorize(enable)' when "
4831         "compiling with -Os/-Oz",
4832         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4833     return true;
4834   }
4835 
4836   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4837   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4838     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4839         "runtime stride == 1 checks needed. Enable vectorization of "
4840         "this loop with '#pragma clang loop vectorize(enable)' when "
4841         "compiling with -Os/-Oz",
4842         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4843     return true;
4844   }
4845 
4846   return false;
4847 }
4848 
4849 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4850   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the access is still
    // likely to be dynamically uniform if the target can skip the check.
4853     reportVectorizationFailure(
4854         "Not inserting runtime ptr check for divergent target",
4855         "runtime pointer checks needed. Not enabled for divergent target",
4856         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4857     return None;
4858   }
4859 
4860   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4861   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4862   if (TC == 1) {
4863     reportVectorizationFailure("Single iteration (non) loop",
4864         "loop trip count is one, irrelevant for vectorization",
4865         "SingleIterationLoop", ORE, TheLoop);
4866     return None;
4867   }
4868 
4869   switch (ScalarEpilogueStatus) {
4870   case CM_ScalarEpilogueAllowed:
4871     return computeFeasibleMaxVF(TC);
4872   case CM_ScalarEpilogueNotNeededUsePredicate:
4873     LLVM_DEBUG(
4874         dbgs() << "LV: vector predicate hint/switch found.\n"
4875                << "LV: Not allowing scalar epilogue, creating predicated "
4876                << "vector loop.\n");
4877     break;
4878   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4879     // fallthrough as a special case of OptForSize
4880   case CM_ScalarEpilogueNotAllowedOptSize:
4881     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4882       LLVM_DEBUG(
4883           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4884     else
4885       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4886                         << "count.\n");
4887 
    // Bail out if runtime checks are required; they are undesirable when
    // optimizing for size.
4890     if (runtimeChecksRequired())
4891       return None;
4892     break;
4893   }
4894 
  // Now try to fold the tail by masking.
4896 
4897   // Invalidate interleave groups that require an epilogue if we can't mask
4898   // the interleave-group.
4899   if (!useMaskedInterleavedAccesses(TTI))
4900     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4901 
4902   unsigned MaxVF = computeFeasibleMaxVF(TC);
4903   if (TC > 0 && TC % MaxVF == 0) {
4904     // Accept MaxVF if we do not have a tail.
4905     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4906     return MaxVF;
4907   }
4908 
4909   // If we don't know the precise trip count, or if the trip count that we
4910   // found modulo the vectorization factor is not zero, try to fold the tail
4911   // by masking.
4912   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4913   if (Legal->prepareToFoldTailByMasking()) {
4914     FoldTailByMasking = true;
4915     return MaxVF;
4916   }
4917 
4918   if (TC == 0) {
4919     reportVectorizationFailure(
4920         "Unable to calculate the loop count due to complex control flow",
4921         "unable to calculate the loop count due to complex control flow",
4922         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4923     return None;
4924   }
4925 
4926   reportVectorizationFailure(
4927       "Cannot optimize for size and vectorize at the same time.",
4928       "cannot optimize for size and vectorize at the same time. "
4929       "Enable vectorization of this loop with '#pragma clang loop "
4930       "vectorize(enable)' when compiling with -Os/-Oz",
4931       "NoTailLoopWithOptForSize", ORE, TheLoop);
4932   return None;
4933 }
4934 
4935 unsigned
4936 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4937   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4938   unsigned SmallestType, WidestType;
4939   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4940   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4941 
4942   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
4946   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4947 
4948   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4949 
4950   unsigned MaxVectorSize = WidestRegister / WidestType;
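  // For illustration of the two computations above (assumed values, not from
  // any particular target): with a 256-bit widest register and i32 (32 bits)
  // as the widest type, MaxVectorSize = 256 / 32 = 8. If the most restrictive
  // dependence only permits MaxVF = 4 for those i32 accesses, then
  // MaxSafeRegisterWidth = 4 * 4 * 8 = 128 bits, WidestRegister is clamped to
  // 128, and MaxVectorSize becomes 128 / 32 = 4.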
4951 
4952   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4953                     << " / " << WidestType << " bits.\n");
4954   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4955                     << WidestRegister << " bits.\n");
4956 
4957   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4958                                  " into one vector!");
4959   if (MaxVectorSize == 0) {
4960     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4961     MaxVectorSize = 1;
4962     return MaxVectorSize;
4963   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4964              isPowerOf2_32(ConstTripCount)) {
4965     // We need to clamp the VF to be the ConstTripCount. There is no point in
4966     // choosing a higher viable VF as done in the loop below.
4967     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4968                       << ConstTripCount << "\n");
4969     MaxVectorSize = ConstTripCount;
4970     return MaxVectorSize;
4971   }
4972 
4973   unsigned MaxVF = MaxVectorSize;
4974   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4975       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4976     // Collect all viable vectorization factors larger than the default MaxVF
4977     // (i.e. MaxVectorSize).
4978     SmallVector<unsigned, 8> VFs;
4979     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4980     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4981       VFs.push_back(VS);
4982 
4983     // For each VF calculate its register usage.
4984     auto RUs = calculateRegisterUsage(VFs);
4985 
4986     // Select the largest VF which doesn't require more registers than existing
4987     // ones.
4988     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4989     for (int i = RUs.size() - 1; i >= 0; --i) {
4990       if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4991         MaxVF = VFs[i];
4992         break;
4993       }
4994     }
4995     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4996       if (MaxVF < MinVF) {
4997         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4998                           << ") with target's minimum: " << MinVF << '\n');
4999         MaxVF = MinVF;
5000       }
5001     }
5002   }
5003   return MaxVF;
5004 }
5005 
5006 VectorizationFactor
5007 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5008   float Cost = expectedCost(1).first;
5009   const float ScalarCost = Cost;
5010   unsigned Width = 1;
5011   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5012 
5013   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5014   if (ForceVectorization && MaxVF > 1) {
5015     // Ignore scalar width, because the user explicitly wants vectorization.
5016     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5017     // evaluation.
5018     Cost = std::numeric_limits<float>::max();
5019   }
5020 
5021   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
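    // Illustrative example (assumed costs): if the scalar loop body costs 8
    // and the VF = 4 body costs 20, the per-lane cost is 20 / 4 = 5, which
    // beats the scalar cost of 8, so VF = 4 would be preferred over VF = 1.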
5025     VectorizationCostTy C = expectedCost(i);
5026     float VectorCost = C.first / (float)i;
5027     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5028                       << " costs: " << (int)VectorCost << ".\n");
5029     if (!C.second && !ForceVectorization) {
5030       LLVM_DEBUG(
5031           dbgs() << "LV: Not considering vector loop of width " << i
5032                  << " because it will not generate any vector instructions.\n");
5033       continue;
5034     }
5035     if (VectorCost < Cost) {
5036       Cost = VectorCost;
5037       Width = i;
5038     }
5039   }
5040 
5041   if (!EnableCondStoresVectorization && NumPredStores) {
5042     reportVectorizationFailure("There are conditional stores.",
5043         "store that is conditionally executed prevents vectorization",
5044         "ConditionalStore", ORE, TheLoop);
5045     Width = 1;
5046     Cost = ScalarCost;
5047   }
5048 
5049   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5050              << "LV: Vectorization seems to be not beneficial, "
5051              << "but was forced by a user.\n");
5052   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5053   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5054   return Factor;
5055 }
5056 
5057 std::pair<unsigned, unsigned>
5058 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5059   unsigned MinWidth = -1U;
5060   unsigned MaxWidth = 8;
5061   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5062 
5063   // For each block.
5064   for (BasicBlock *BB : TheLoop->blocks()) {
5065     // For each instruction in the loop.
5066     for (Instruction &I : BB->instructionsWithoutDebug()) {
5067       Type *T = I.getType();
5068 
5069       // Skip ignored values.
5070       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5071         continue;
5072 
5073       // Only examine Loads, Stores and PHINodes.
5074       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5075         continue;
5076 
5077       // Examine PHI nodes that are reduction variables. Update the type to
5078       // account for the recurrence type.
5079       if (auto *PN = dyn_cast<PHINode>(&I)) {
5080         if (!Legal->isReductionVariable(PN))
5081           continue;
5082         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5083         T = RdxDesc.getRecurrenceType();
5084       }
5085 
5086       // Examine the stored values.
5087       if (auto *ST = dyn_cast<StoreInst>(&I))
5088         T = ST->getValueOperand()->getType();
5089 
5090       // Ignore loaded pointer types and stored pointer types that are not
5091       // vectorizable.
5092       //
5093       // FIXME: The check here attempts to predict whether a load or store will
5094       //        be vectorized. We only know this for certain after a VF has
5095       //        been selected. Here, we assume that if an access can be
5096       //        vectorized, it will be. We should also look at extending this
5097       //        optimization to non-pointer types.
5098       //
5099       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5100           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5101         continue;
5102 
5103       MinWidth = std::min(MinWidth,
5104                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5105       MaxWidth = std::max(MaxWidth,
5106                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5107     }
5108   }
5109 
5110   return {MinWidth, MaxWidth};
5111 }
5112 
5113 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5114                                                            unsigned LoopCost) {
5115   // -- The interleave heuristics --
5116   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5117   // There are many micro-architectural considerations that we can't predict
5118   // at this level. For example, frontend pressure (on decode or fetch) due to
5119   // code size, or the number and capabilities of the execution ports.
5120   //
5121   // We use the following heuristics to select the interleave count:
5122   // 1. If the code has reductions, then we interleave to break the cross
5123   // iteration dependency.
5124   // 2. If the loop is really small, then we interleave to reduce the loop
5125   // overhead.
5126   // 3. We don't interleave if we think that we will spill registers to memory
5127   // due to the increased register pressure.
5128 
5129   if (!isScalarEpilogueAllowed())
5130     return 1;
5131 
  // The maximum safe dependence distance has already been used to limit the
  // vectorization factor, so do not interleave on top of that.
5133   if (Legal->getMaxSafeDepDistBytes() != -1U)
5134     return 1;
5135 
5136   // Do not interleave loops with a relatively small trip count.
5137   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5138   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
5139     return 1;
5140 
5141   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
5142   LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5143                     << " registers\n");
5144 
5145   if (VF == 1) {
5146     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5147       TargetNumRegisters = ForceTargetNumScalarRegs;
5148   } else {
5149     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5150       TargetNumRegisters = ForceTargetNumVectorRegs;
5151   }
5152 
5153   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by this value below, so make sure it is at least one to avoid
  // dividing by zero; that is, assume at least one instruction uses at least
  // one register.
5156   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5157 
5158   // We calculate the interleave count using the following formula.
5159   // Subtract the number of loop invariants from the number of available
5160   // registers. These registers are used by all of the interleaved instances.
5161   // Next, divide the remaining registers by the number of registers that is
5162   // required by the loop, in order to estimate how many parallel instances
5163   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations and alignment considerations.
  // We also want a power-of-two interleave count to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case
  // IC is set to 1 above.
5169   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5170                               R.MaxLocalUsers);
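  // Illustrative example (assumed register counts): with TargetNumRegisters
  // = 16, R.LoopInvariantRegs = 2 and R.MaxLocalUsers = 3, this gives
  // IC = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.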
5171 
5172   // Don't count the induction variable as interleaved.
5173   if (EnableIndVarRegisterHeur)
5174     IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5175                        std::max(1U, (R.MaxLocalUsers - 1)));
5176 
5177   // Clamp the interleave ranges to reasonable counts.
5178   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5179 
5180   // Check if the user has overridden the max.
5181   if (VF == 1) {
5182     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5183       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5184   } else {
5185     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5186       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5187   }
5188 
  // If the trip count is constant, limit the interleave count to at most the
  // trip count divided by VF.
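  // For instance (illustrative), with a constant trip count of 16 and VF = 4,
  // at most 16 / 4 = 4 interleaved copies of the vector body fit.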
5191   if (TC > 0) {
5192     assert(TC >= VF && "VF exceeds trip count?");
5193     if ((TC / VF) < MaxInterleaveCount)
5194       MaxInterleaveCount = (TC / VF);
5195   }
5196 
5197   // If we did not calculate the cost for VF (because the user selected the VF)
5198   // then we calculate the cost of VF here.
5199   if (LoopCost == 0)
5200     LoopCost = expectedCost(VF).first;
5201 
5202   assert(LoopCost && "Non-zero loop cost expected");
5203 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5206   if (IC > MaxInterleaveCount)
5207     IC = MaxInterleaveCount;
5208   else if (IC < 1)
5209     IC = 1;
5210 
5211   // Interleave if we vectorized this loop and there is a reduction that could
5212   // benefit from interleaving.
5213   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5214     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5215     return IC;
5216   }
5217 
5218   // Note that if we've already vectorized the loop we will have done the
5219   // runtime check and so interleaving won't require further checks.
5220   bool InterleavingRequiresRuntimePointerCheck =
5221       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5222 
5223   // We want to interleave small loops in order to reduce the loop overhead and
5224   // potentially expose ILP opportunities.
5225   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5226   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead costs 1, use the cost model to
    // estimate the cost of the loop body, and interleave until the loop
    // overhead is about 5% of the total cost of the loop.
5230     unsigned SmallIC =
5231         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
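    // Illustrative example (assumed values): with SmallLoopCost = 20 and an
    // estimated LoopCost of 5, SmallLoopCost / LoopCost = 4, so
    // SmallIC = min(IC, PowerOf2Floor(4)) = min(IC, 4).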
5232 
5233     // Interleave until store/load ports (estimated by max interleave count) are
5234     // saturated.
5235     unsigned NumStores = Legal->getNumStores();
5236     unsigned NumLoads = Legal->getNumLoads();
5237     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5238     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5239 
5240     // If we have a scalar reduction (vector reductions are already dealt with
5241     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit the interleave count,
    // by default to 2, so the critical path only gets increased by one
    // reduction operation.
5244     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5245       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5246       SmallIC = std::min(SmallIC, F);
5247       StoresIC = std::min(StoresIC, F);
5248       LoadsIC = std::min(LoadsIC, F);
5249     }
5250 
5251     if (EnableLoadStoreRuntimeInterleave &&
5252         std::max(StoresIC, LoadsIC) > SmallIC) {
5253       LLVM_DEBUG(
5254           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5255       return std::max(StoresIC, LoadsIC);
5256     }
5257 
5258     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5259     return SmallIC;
5260   }
5261 
5262   // Interleave if this is a large loop (small loops are already dealt with by
5263   // this point) that could benefit from interleaving.
5264   bool HasReductions = !Legal->getReductionVars()->empty();
5265   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5266     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5267     return IC;
5268   }
5269 
5270   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5271   return 1;
5272 }
5273 
5274 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5275 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5276   // This function calculates the register usage by measuring the highest number
5277   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5279   // assign a number to each instruction. We use RPO to ensure that defs are
5280   // met before their users. We assume that each instruction that has in-loop
5281   // users starts an interval. We record every time that an in-loop value is
5282   // used, so we have a list of the first and last occurrences of each
5283   // instruction. Next, we transpose this data structure into a multi map that
5284   // holds the list of intervals that *end* at a specific location. This multi
5285   // map allows us to perform a linear search. We scan the instructions linearly
5286   // and record each time that a new interval starts, by placing it in a set.
5287   // If we find this value in the multi-map then we remove it from the set.
5288   // The max register usage is the maximum size of the set.
5289   // We also search for instructions that are defined outside the loop, but are
5290   // used inside the loop. We need this number separately from the max-interval
5291   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
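  // Illustrative example (a hypothetical four-instruction block, not output
  // of the actual analysis):
  //   %a = load ...
  //   %b = add %a, 1
  //   %c = mul %a, %b
  //   store %c, ...
  // When %c is reached, the intervals of both %a and %b are still open, so
  // the maximum number of simultaneously open intervals, and hence
  // MaxLocalUsers at VF = 1, is 2.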
5293   LoopBlocksDFS DFS(TheLoop);
5294   DFS.perform(LI);
5295 
5296   RegisterUsage RU;
5297 
  // Each 'key' in the map opens a new interval. The value of the map is the
  // index of the 'last seen' usage of the instruction that is the key.
5301   using IntervalMap = DenseMap<Instruction *, unsigned>;
5302 
5303   // Maps instruction to its index.
5304   SmallVector<Instruction *, 64> IdxToInstr;
5305   // Marks the end of each interval.
5306   IntervalMap EndPoint;
  // Saves the set of instructions that are used in the loop.
5308   SmallPtrSet<Instruction *, 8> Ends;
5309   // Saves the list of values that are used in the loop but are
5310   // defined outside the loop, such as arguments and constants.
5311   SmallPtrSet<Value *, 8> LoopInvariants;
5312 
5313   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5314     for (Instruction &I : BB->instructionsWithoutDebug()) {
5315       IdxToInstr.push_back(&I);
5316 
5317       // Save the end location of each USE.
5318       for (Value *U : I.operands()) {
5319         auto *Instr = dyn_cast<Instruction>(U);
5320 
5321         // Ignore non-instruction values such as arguments, constants, etc.
5322         if (!Instr)
5323           continue;
5324 
5325         // If this instruction is outside the loop then record it and continue.
5326         if (!TheLoop->contains(Instr)) {
5327           LoopInvariants.insert(Instr);
5328           continue;
5329         }
5330 
5331         // Overwrite previous end points.
5332         EndPoint[Instr] = IdxToInstr.size();
5333         Ends.insert(Instr);
5334       }
5335     }
5336   }
5337 
5338   // Saves the list of intervals that end with the index in 'key'.
5339   using InstrList = SmallVector<Instruction *, 2>;
5340   DenseMap<unsigned, InstrList> TransposeEnds;
5341 
5342   // Transpose the EndPoints to a list of values that end at each index.
5343   for (auto &Interval : EndPoint)
5344     TransposeEnds[Interval.second].push_back(Interval.first);
5345 
5346   SmallPtrSet<Instruction *, 8> OpenIntervals;
5347 
5348   // Get the size of the widest register.
5349   unsigned MaxSafeDepDist = -1U;
5350   if (Legal->getMaxSafeDepDistBytes() != -1U)
5351     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5352   unsigned WidestRegister =
5353       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5354   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5355 
5356   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5357   SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5358 
5359   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5360 
5361   // A lambda that gets the register usage for the given type and VF.
5362   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5363     if (Ty->isTokenTy())
5364       return 0U;
5365     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5366     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5367   };
5368 
5369   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5370     Instruction *I = IdxToInstr[i];
5371 
5372     // Remove all of the instructions that end at this location.
5373     InstrList &List = TransposeEnds[i];
5374     for (Instruction *ToRemove : List)
5375       OpenIntervals.erase(ToRemove);
5376 
5377     // Ignore instructions that are never used within the loop.
5378     if (Ends.find(I) == Ends.end())
5379       continue;
5380 
5381     // Skip ignored values.
5382     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5383       continue;
5384 
5385     // For each VF find the maximum usage of registers.
5386     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5387       if (VFs[j] == 1) {
5388         MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5389         continue;
5390       }
5391       collectUniformsAndScalars(VFs[j]);
5392       // Count the number of live intervals.
5393       unsigned RegUsage = 0;
5394       for (auto Inst : OpenIntervals) {
5395         // Skip ignored values for VF > 1.
5396         if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5397             isScalarAfterVectorization(Inst, VFs[j]))
5398           continue;
5399         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5400       }
5401       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5402     }
5403 
5404     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5405                       << OpenIntervals.size() << '\n');
5406 
5407     // Add the current instruction to the list of open intervals.
5408     OpenIntervals.insert(I);
5409   }
5410 
5411   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5412     unsigned Invariant = 0;
5413     if (VFs[i] == 1)
5414       Invariant = LoopInvariants.size();
5415     else {
5416       for (auto Inst : LoopInvariants)
5417         Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5418     }
5419 
5420     LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5421     LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5422     LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5423                       << '\n');
5424 
5425     RU.LoopInvariantRegs = Invariant;
5426     RU.MaxLocalUsers = MaxUsages[i];
5427     RUs[i] = RU;
5428   }
5429 
5430   return RUs;
5431 }
5432 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5434   // TODO: Cost model for emulated masked load/store is completely
5435   // broken. This hack guides the cost model to use an artificially
5436   // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost
  // model. Masked load/gather emulation was previously never allowed.
  // A limited amount of masked store/scatter emulation was allowed.
5442   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5443   return isa<LoadInst>(I) ||
5444          (isa<StoreInst>(I) &&
5445           NumPredStores > NumberOfStoresToPredicate);
5446 }
5447 
5448 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5449   // If we aren't vectorizing the loop, or if we've already collected the
5450   // instructions to scalarize, there's nothing to do. Collection may already
5451   // have occurred if we have a user-selected VF and are now computing the
5452   // expected cost for interleaving.
5453   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5454     return;
5455 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5457   // not profitable to scalarize any instructions, the presence of VF in the
5458   // map will indicate that we've analyzed it already.
5459   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5460 
5461   // Find all the instructions that are scalar with predication in the loop and
  // determine whether it would be better not to if-convert the blocks they
  // are in. If so, we also record the instructions to scalarize.
5463   // If so, we also record the instructions to scalarize.
5464   for (BasicBlock *BB : TheLoop->blocks()) {
5465     if (!blockNeedsPredication(BB))
5466       continue;
5467     for (Instruction &I : *BB)
5468       if (isScalarWithPredication(&I)) {
5469         ScalarCostsTy ScalarCosts;
5470         // Do not apply discount logic if hacked cost is needed
5471         // for emulated masked memrefs.
5472         if (!useEmulatedMaskMemRefHack(&I) &&
5473             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5474           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5475         // Remember that BB will remain after vectorization.
5476         PredicatedBBsAfterVectorization.insert(BB);
5477       }
5478   }
5479 }
5480 
5481 int LoopVectorizationCostModel::computePredInstDiscount(
5482     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5483     unsigned VF) {
5484   assert(!isUniformAfterVectorization(PredInst, VF) &&
5485          "Instruction marked uniform-after-vectorization will be predicated");
5486 
5487   // Initialize the discount to zero, meaning that the scalar version and the
5488   // vector version cost the same.
5489   int Discount = 0;
5490 
5491   // Holds instructions to analyze. The instructions we visit are mapped in
5492   // ScalarCosts. Those instructions are the ones that would be scalarized if
5493   // we find that the scalar version costs less.
5494   SmallVector<Instruction *, 8> Worklist;
5495 
5496   // Returns true if the given instruction can be scalarized.
5497   auto canBeScalarized = [&](Instruction *I) -> bool {
5498     // We only attempt to scalarize instructions forming a single-use chain
5499     // from the original predicated block that would otherwise be vectorized.
5500     // Although not strictly necessary, we give up on instructions we know will
5501     // already be scalar to avoid traversing chains that are unlikely to be
5502     // beneficial.
5503     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5504         isScalarAfterVectorization(I, VF))
5505       return false;
5506 
5507     // If the instruction is scalar with predication, it will be analyzed
5508     // separately. We ignore it within the context of PredInst.
5509     if (isScalarWithPredication(I))
5510       return false;
5511 
5512     // If any of the instruction's operands are uniform after vectorization,
5513     // the instruction cannot be scalarized. This prevents, for example, a
5514     // masked load from being scalarized.
5515     //
5516     // We assume we will only emit a value for lane zero of an instruction
5517     // marked uniform after vectorization, rather than VF identical values.
5518     // Thus, if we scalarize an instruction that uses a uniform, we would
5519     // create uses of values corresponding to the lanes we aren't emitting code
5520     // for. This behavior can be changed by allowing getScalarValue to clone
5521     // the lane zero values for uniforms rather than asserting.
5522     for (Use &U : I->operands())
5523       if (auto *J = dyn_cast<Instruction>(U.get()))
5524         if (isUniformAfterVectorization(J, VF))
5525           return false;
5526 
5527     // Otherwise, we can scalarize the instruction.
5528     return true;
5529   };
5530 
5531   // Compute the expected cost discount from scalarizing the entire expression
5532   // feeding the predicated instruction. We currently only consider expressions
5533   // that are single-use instruction chains.
5534   Worklist.push_back(PredInst);
5535   while (!Worklist.empty()) {
5536     Instruction *I = Worklist.pop_back_val();
5537 
5538     // If we've already analyzed the instruction, there's nothing to do.
5539     if (ScalarCosts.find(I) != ScalarCosts.end())
5540       continue;
5541 
5542     // Compute the cost of the vector instruction. Note that this cost already
5543     // includes the scalarization overhead of the predicated instruction.
5544     unsigned VectorCost = getInstructionCost(I, VF).first;
5545 
5546     // Compute the cost of the scalarized instruction. This cost is the cost of
5547     // the instruction as if it wasn't if-converted and instead remained in the
5548     // predicated block. We will scale this cost by block probability after
5549     // computing the scalarization overhead.
5550     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5551 
5552     // Compute the scalarization overhead of needed insertelement instructions
5553     // and phi nodes.
5554     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5555       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5556                                                  true, false);
5557       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5558     }
5559 
5560     // Compute the scalarization overhead of needed extractelement
5561     // instructions. For each of the instruction's operands, if the operand can
5562     // be scalarized, add it to the worklist; otherwise, account for the
5563     // overhead.
5564     for (Use &U : I->operands())
5565       if (auto *J = dyn_cast<Instruction>(U.get())) {
5566         assert(VectorType::isValidElementType(J->getType()) &&
5567                "Instruction has non-scalar type");
5568         if (canBeScalarized(J))
5569           Worklist.push_back(J);
5570         else if (needsExtract(J, VF))
          ScalarCost += TTI.getScalarizationOverhead(
              ToVectorTy(J->getType(), VF), false, true);
5573       }
5574 
5575     // Scale the total scalar cost by block probability.
5576     ScalarCost /= getReciprocalPredBlockProb();
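    // For example (illustrative), if getReciprocalPredBlockProb() is 2, i.e.
    // the predicated block is assumed to execute half the time, a scalar cost
    // of 10 is scaled down to 5.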
5577 
5578     // Compute the discount. A non-negative discount means the vector version
5579     // of the instruction costs more, and scalarizing would be beneficial.
5580     Discount += VectorCost - ScalarCost;
5581     ScalarCosts[I] = ScalarCost;
5582   }
5583 
5584   return Discount;
5585 }
5586 
5587 LoopVectorizationCostModel::VectorizationCostTy
5588 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5589   VectorizationCostTy Cost;
5590 
5591   // For each block.
5592   for (BasicBlock *BB : TheLoop->blocks()) {
5593     VectorizationCostTy BlockCost;
5594 
5595     // For each instruction in the old loop.
5596     for (Instruction &I : BB->instructionsWithoutDebug()) {
5597       // Skip ignored values.
5598       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5599           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5600         continue;
5601 
5602       VectorizationCostTy C = getInstructionCost(&I, VF);
5603 
5604       // Check if we should override the cost.
5605       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5606         C.first = ForceTargetInstructionCost;
5607 
5608       BlockCost.first += C.first;
5609       BlockCost.second |= C.second;
5610       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5611                         << " for VF " << VF << " For instruction: " << I
5612                         << '\n');
5613     }
5614 
5615     // If we are vectorizing a predicated block, it will have been
5616     // if-converted. This means that the block's instructions (aside from
5617     // stores and instructions that may divide by zero) will now be
5618     // unconditionally executed. For the scalar case, we may not always execute
5619     // the predicated block. Thus, scale the block's cost by the probability of
5620     // executing it.
5621     if (VF == 1 && blockNeedsPredication(BB))
5622       BlockCost.first /= getReciprocalPredBlockProb();
5623 
5624     Cost.first += BlockCost.first;
5625     Cost.second |= BlockCost.second;
5626   }
5627 
5628   return Cost;
5629 }
5630 
5631 /// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant, except for the induction variable dependence.
5633 ///
5634 /// This SCEV can be sent to the Target in order to estimate the address
5635 /// calculation cost.
5636 static const SCEV *getAddressAccessSCEV(
5637               Value *Ptr,
5638               LoopVectorizationLegality *Legal,
5639               PredicatedScalarEvolution &PSE,
5640               const Loop *TheLoop) {
5641 
5642   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5643   if (!Gep)
5644     return nullptr;
5645 
5646   // We are looking for a gep with all loop invariant indices except for one
5647   // which should be an induction variable.
5648   auto SE = PSE.getSE();
5649   unsigned NumOperands = Gep->getNumOperands();
5650   for (unsigned i = 1; i < NumOperands; ++i) {
5651     Value *Opd = Gep->getOperand(i);
5652     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5653         !Legal->isInductionVariable(Opd))
5654       return nullptr;
5655   }
5656 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5658   return PSE.getSCEV(Ptr);
5659 }
5660 
5661 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5662   return Legal->hasStride(I->getOperand(0)) ||
5663          Legal->hasStride(I->getOperand(1));
5664 }
5665 
5666 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5667                                                                  unsigned VF) {
5668   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5669   Type *ValTy = getMemInstValueType(I);
5670   auto SE = PSE.getSE();
5671 
5672   unsigned Alignment = getLoadStoreAlignment(I);
5673   unsigned AS = getLoadStoreAddressSpace(I);
5674   Value *Ptr = getLoadStorePointerOperand(I);
5675   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5676 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5679   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5680 
5681   // Get the cost of the scalar memory instruction and address computation.
5682   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5683 
5684   // Don't pass *I here, since it is scalar but will actually be part of a
5685   // vectorized loop where the user of it is a vectorized instruction.
5686   Cost += VF *
5687           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5688                               AS);
5689 
5690   // Get the overhead of the extractelement and insertelement instructions
5691   // we might create due to scalarization.
5692   Cost += getScalarizationOverhead(I, VF);
5693 
5694   // If we have a predicated store, it may not be executed for each vector
5695   // lane. Scale the cost by the probability of executing the predicated
5696   // block.
5697   if (isPredicatedInst(I)) {
5698     Cost /= getReciprocalPredBlockProb();
5699 
5700     if (useEmulatedMaskMemRefHack(I))
5701       // Artificially setting to a high enough value to practically disable
5702       // vectorization with such operations.
5703       Cost = 3000000;
5704   }
5705 
5706   return Cost;
5707 }
5708 
5709 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5710                                                              unsigned VF) {
5711   Type *ValTy = getMemInstValueType(I);
5712   Type *VectorTy = ToVectorTy(ValTy, VF);
5713   unsigned Alignment = getLoadStoreAlignment(I);
5714   Value *Ptr = getLoadStorePointerOperand(I);
5715   unsigned AS = getLoadStoreAddressSpace(I);
5716   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5717 
5718   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5719          "Stride should be 1 or -1 for consecutive memory access");
5720   unsigned Cost = 0;
5721   if (Legal->isMaskRequired(I))
5722     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5723   else
5724     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5725 
5726   bool Reverse = ConsecutiveStride < 0;
5727   if (Reverse)
5728     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5729   return Cost;
5730 }
5731 
5732 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5733                                                          unsigned VF) {
5734   Type *ValTy = getMemInstValueType(I);
5735   Type *VectorTy = ToVectorTy(ValTy, VF);
5736   unsigned Alignment = getLoadStoreAlignment(I);
5737   unsigned AS = getLoadStoreAddressSpace(I);
5738   if (isa<LoadInst>(I)) {
5739     return TTI.getAddressComputationCost(ValTy) +
5740            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5741            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5742   }
5743   StoreInst *SI = cast<StoreInst>(I);
5744 
5745   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5746   return TTI.getAddressComputationCost(ValTy) +
5747          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5748          (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5749                                                Instruction::ExtractElement,
5750                                                VectorTy, VF - 1));
5751 }
5752 
5753 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5754                                                           unsigned VF) {
5755   Type *ValTy = getMemInstValueType(I);
5756   Type *VectorTy = ToVectorTy(ValTy, VF);
5757   unsigned Alignment = getLoadStoreAlignment(I);
5758   Value *Ptr = getLoadStorePointerOperand(I);
5759 
5760   return TTI.getAddressComputationCost(VectorTy) +
5761          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5762                                     Legal->isMaskRequired(I), Alignment);
5763 }
5764 
5765 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5766                                                             unsigned VF) {
5767   Type *ValTy = getMemInstValueType(I);
5768   Type *VectorTy = ToVectorTy(ValTy, VF);
5769   unsigned AS = getLoadStoreAddressSpace(I);
5770 
5771   auto Group = getInterleavedAccessGroup(I);
5772   assert(Group && "Fail to get an interleaved access group.");
5773 
5774   unsigned InterleaveFactor = Group->getFactor();
5775   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5776 
5777   // Holds the indices of existing members in an interleaved load group.
5778   // An interleaved store group doesn't need this as it doesn't allow gaps.
5779   SmallVector<unsigned, 4> Indices;
5780   if (isa<LoadInst>(I)) {
5781     for (unsigned i = 0; i < InterleaveFactor; i++)
5782       if (Group->getMember(i))
5783         Indices.push_back(i);
5784   }
5785 
5786   // Calculate the cost of the whole interleaved group.
5787   bool UseMaskForGaps =
5788       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5789   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5790       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5791       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5792 
5793   if (Group->isReverse()) {
5794     // TODO: Add support for reversed masked interleaved access.
5795     assert(!Legal->isMaskRequired(I) &&
5796            "Reverse masked interleaved access not supported.");
5797     Cost += Group->getNumMembers() *
5798             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5799   }
5800   return Cost;
5801 }
5802 
5803 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5804                                                               unsigned VF) {
5805   // Calculate scalar cost only. Vectorization cost should be ready at this
5806   // moment.
5807   if (VF == 1) {
5808     Type *ValTy = getMemInstValueType(I);
5809     unsigned Alignment = getLoadStoreAlignment(I);
5810     unsigned AS = getLoadStoreAddressSpace(I);
5811 
5812     return TTI.getAddressComputationCost(ValTy) +
5813            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5814   }
5815   return getWideningCost(I, VF);
5816 }
5817 
5818 LoopVectorizationCostModel::VectorizationCostTy
5819 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5820   // If we know that this instruction will remain uniform, check the cost of
5821   // the scalar version.
5822   if (isUniformAfterVectorization(I, VF))
5823     VF = 1;
5824 
5825   if (VF > 1 && isProfitableToScalarize(I, VF))
5826     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5827 
5828   // Forced scalars do not have any scalarization overhead.
5829   auto ForcedScalar = ForcedScalars.find(VF);
5830   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5831     auto InstSet = ForcedScalar->second;
5832     if (InstSet.find(I) != InstSet.end())
5833       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5834   }
5835 
5836   Type *VectorTy;
5837   unsigned C = getInstructionCost(I, VF, VectorTy);
5838 
5839   bool TypeNotScalarized =
5840       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5841   return VectorizationCostTy(C, TypeNotScalarized);
5842 }
5843 
5844 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5845                                                               unsigned VF) {
5846 
5847   if (VF == 1)
5848     return 0;
5849 
5850   unsigned Cost = 0;
5851   Type *RetTy = ToVectorTy(I->getType(), VF);
5852   if (!RetTy->isVoidTy() &&
5853       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5854     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5855 
5856   // Some targets keep addresses scalar.
5857   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5858     return Cost;
5859 
5860   // Some targets support efficient element stores.
5861   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5862     return Cost;
5863 
5864   // Collect operands to consider.
5865   CallInst *CI = dyn_cast<CallInst>(I);
5866   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5867 
5868   // Skip operands that do not require extraction/scalarization and do not incur
5869   // any overhead.
5870   return Cost + TTI.getOperandsScalarizationOverhead(
5871                     filterExtractingOperands(Ops, VF), VF);
5872 }
5873 
5874 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5875   if (VF == 1)
5876     return;
5877   NumPredStores = 0;
5878   for (BasicBlock *BB : TheLoop->blocks()) {
5879     // For each instruction in the old loop.
5880     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
5882       if (!Ptr)
5883         continue;
5884 
5885       // TODO: We should generate better code and update the cost model for
5886       // predicated uniform stores. Today they are treated as any other
5887       // predicated store (see added test cases in
5888       // invariant-store-vectorization.ll).
5889       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5890         NumPredStores++;
5891 
5892       if (Legal->isUniform(Ptr) &&
5893           // Conditional loads and stores should be scalarized and predicated.
5894           // isScalarWithPredication cannot be used here since masked
5895           // gather/scatters are not considered scalar with predication.
5896           !Legal->blockNeedsPredication(I.getParent())) {
5897         // TODO: Avoid replicating loads and stores instead of
5898         // relying on instcombine to remove them.
5899         // Load: Scalar load + broadcast
5900         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5901         unsigned Cost = getUniformMemOpCost(&I, VF);
5902         setWideningDecision(&I, VF, CM_Scalarize, Cost);
5903         continue;
5904       }
5905 
5906       // We assume that widening is the best solution when possible.
5907       if (memoryInstructionCanBeWidened(&I, VF)) {
5908         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5909         int ConsecutiveStride =
5910                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5911         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5912                "Expected consecutive stride.");
5913         InstWidening Decision =
5914             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5915         setWideningDecision(&I, VF, Decision, Cost);
5916         continue;
5917       }
5918 
5919       // Choose between Interleaving, Gather/Scatter or Scalarization.
5920       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5921       unsigned NumAccesses = 1;
5922       if (isAccessInterleaved(&I)) {
5923         auto Group = getInterleavedAccessGroup(&I);
5924         assert(Group && "Fail to get an interleaved access group.");
5925 
5926         // Make one decision for the whole group.
5927         if (getWideningDecision(&I, VF) != CM_Unknown)
5928           continue;
5929 
5930         NumAccesses = Group->getNumMembers();
5931         if (interleavedAccessCanBeWidened(&I, VF))
5932           InterleaveCost = getInterleaveGroupCost(&I, VF);
5933       }
5934 
5935       unsigned GatherScatterCost =
5936           isLegalGatherOrScatter(&I)
5937               ? getGatherScatterCost(&I, VF) * NumAccesses
5938               : std::numeric_limits<unsigned>::max();
5939 
5940       unsigned ScalarizationCost =
5941           getMemInstScalarizationCost(&I, VF) * NumAccesses;
5942 
5943       // Choose better solution for the current VF,
5944       // write down this decision and use it during vectorization.
5945       unsigned Cost;
5946       InstWidening Decision;
5947       if (InterleaveCost <= GatherScatterCost &&
5948           InterleaveCost < ScalarizationCost) {
5949         Decision = CM_Interleave;
5950         Cost = InterleaveCost;
5951       } else if (GatherScatterCost < ScalarizationCost) {
5952         Decision = CM_GatherScatter;
5953         Cost = GatherScatterCost;
5954       } else {
5955         Decision = CM_Scalarize;
5956         Cost = ScalarizationCost;
5957       }
      // If the instruction belongs to an interleave group, the whole group
5959       // receives the same decision. The whole group receives the cost, but
5960       // the cost will actually be assigned to one instruction.
5961       if (auto Group = getInterleavedAccessGroup(&I))
5962         setWideningDecision(Group, VF, Decision, Cost);
5963       else
5964         setWideningDecision(&I, VF, Decision, Cost);
5965     }
5966   }
5967 
5968   // Make sure that any load of address and any other address computation
5969   // remains scalar unless there is gather/scatter support. This avoids
5970   // inevitable extracts into address registers, and also has the benefit of
5971   // activating LSR more, since that pass can't optimize vectorized
5972   // addresses.
5973   if (TTI.prefersVectorizedAddressing())
5974     return;
5975 
5976   // Start with all scalar pointer uses.
5977   SmallPtrSet<Instruction *, 8> AddrDefs;
5978   for (BasicBlock *BB : TheLoop->blocks())
5979     for (Instruction &I : *BB) {
5980       Instruction *PtrDef =
5981         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5982       if (PtrDef && TheLoop->contains(PtrDef) &&
5983           getWideningDecision(&I, VF) != CM_GatherScatter)
5984         AddrDefs.insert(PtrDef);
5985     }
5986 
5987   // Add all instructions used to generate the addresses.
5988   SmallVector<Instruction *, 4> Worklist;
5989   for (auto *I : AddrDefs)
5990     Worklist.push_back(I);
5991   while (!Worklist.empty()) {
5992     Instruction *I = Worklist.pop_back_val();
5993     for (auto &Op : I->operands())
5994       if (auto *InstOp = dyn_cast<Instruction>(Op))
5995         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5996             AddrDefs.insert(InstOp).second)
5997           Worklist.push_back(InstOp);
5998   }
5999 
6000   for (auto *I : AddrDefs) {
6001     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
6004       // if the loaded register is involved in an address computation, it is
6005       // instead changed here when we know this is the case.
6006       InstWidening Decision = getWideningDecision(I, VF);
6007       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6008         // Scalarize a widened load of address.
6009         setWideningDecision(I, VF, CM_Scalarize,
6010                             (VF * getMemoryInstructionCost(I, 1)));
6011       else if (auto Group = getInterleavedAccessGroup(I)) {
6012         // Scalarize an interleave group of address loads.
6013         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6014           if (Instruction *Member = Group->getMember(I))
6015             setWideningDecision(Member, VF, CM_Scalarize,
6016                                 (VF * getMemoryInstructionCost(Member, 1)));
6017         }
6018       }
6019     } else
6020       // Make sure I gets scalarized and a cost estimate without
6021       // scalarization overhead.
6022       ForcedScalars[VF].insert(I);
6023   }
6024 }
6025 
6026 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6027                                                         unsigned VF,
6028                                                         Type *&VectorTy) {
6029   Type *RetTy = I->getType();
6030   if (canTruncateToMinimalBitwidth(I, VF))
6031     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6032   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6033   auto SE = PSE.getSE();
6034 
6035   // TODO: We need to estimate the cost of intrinsic calls.
6036   switch (I->getOpcode()) {
6037   case Instruction::GetElementPtr:
6038     // We mark this instruction as zero-cost because the cost of GEPs in
6039     // vectorized code depends on whether the corresponding memory instruction
6040     // is scalarized or not. Therefore, we handle GEPs with the memory
6041     // instruction cost.
6042     return 0;
6043   case Instruction::Br: {
6044     // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
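    // For example (illustrative), with VF = 4 this is the overhead of
    // extracting four i1 elements from the vector compare plus the cost of
    // four scalar branches.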
6047     bool ScalarPredicatedBB = false;
6048     BranchInst *BI = cast<BranchInst>(I);
6049     if (VF > 1 && BI->isConditional() &&
6050         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6051              PredicatedBBsAfterVectorization.end() ||
6052          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6053              PredicatedBBsAfterVectorization.end()))
6054       ScalarPredicatedBB = true;
6055 
6056     if (ScalarPredicatedBB) {
6057       // Return cost for branches around scalarized and predicated blocks.
6058       Type *Vec_i1Ty =
6059           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6060       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6061               (TTI.getCFInstrCost(Instruction::Br) * VF));
6062     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6063       // The back-edge branch will remain, as will all scalar branches.
6064       return TTI.getCFInstrCost(Instruction::Br);
6065     else
6066       // This branch will be eliminated by if-conversion.
6067       return 0;
6068     // Note: We currently assume zero cost for an unconditional branch inside
6069     // a predicated block since it will become a fall-through, although we
6070     // may decide in the future to call TTI for all branches.
6071   }
6072   case Instruction::PHI: {
6073     auto *Phi = cast<PHINode>(I);
6074 
6075     // First-order recurrences are replaced by vector shuffles inside the loop.
6076     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6077     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6078       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6079                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6080 
6081     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6082     // converted into select instructions. We require N - 1 selects per phi
6083     // node, where N is the number of incoming values.
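    // For example (illustrative), a phi node with three incoming values in a
    // non-header block becomes 3 - 1 = 2 vector selects.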
6084     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6085       return (Phi->getNumIncomingValues() - 1) *
6086              TTI.getCmpSelInstrCost(
6087                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6088                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6089 
6090     return TTI.getCFInstrCost(Instruction::PHI);
6091   }
6092   case Instruction::UDiv:
6093   case Instruction::SDiv:
6094   case Instruction::URem:
6095   case Instruction::SRem:
6096     // If we have a predicated instruction, it may not be executed for each
6097     // vector lane. Get the scalarization cost and scale this amount by the
6098     // probability of executing the predicated block. If the instruction is not
6099     // predicated, we fall through to the next case.
6100     if (VF > 1 && isScalarWithPredication(I)) {
6101       unsigned Cost = 0;
6102 
6103       // These instructions have a non-void type, so account for the phi nodes
6104       // that we will create. This cost is likely to be zero. The phi node
6105       // cost, if any, should be scaled by the block probability because it
6106       // models a copy at the end of each predicated block.
6107       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6108 
6109       // The cost of the non-predicated instruction.
6110       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6111 
6112       // The cost of insertelement and extractelement instructions needed for
6113       // scalarization.
6114       Cost += getScalarizationOverhead(I, VF);
6115 
6116       // Scale the cost by the probability of executing the predicated blocks.
6117       // This assumes the predicated block for each vector lane is equally
6118       // likely.
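      // E.g., assuming the default reciprocal block probability of 2 (each
      // predicated block executes with a probability of 1/2), the
      // scalarization cost computed above is halved.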
6119       return Cost / getReciprocalPredBlockProb();
6120     }
6121     LLVM_FALLTHROUGH;
6122   case Instruction::Add:
6123   case Instruction::FAdd:
6124   case Instruction::Sub:
6125   case Instruction::FSub:
6126   case Instruction::Mul:
6127   case Instruction::FMul:
6128   case Instruction::FDiv:
6129   case Instruction::FRem:
6130   case Instruction::Shl:
6131   case Instruction::LShr:
6132   case Instruction::AShr:
6133   case Instruction::And:
6134   case Instruction::Or:
6135   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
6137     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6138       return 0;
6139     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6141     Value *Op2 = I->getOperand(1);
6142     TargetTransformInfo::OperandValueProperties Op2VP;
6143     TargetTransformInfo::OperandValueKind Op2VK =
6144         TTI.getOperandInfo(Op2, Op2VP);
6145     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6146       Op2VK = TargetTransformInfo::OK_UniformValue;
6147 
6148     SmallVector<const Value *, 4> Operands(I->operand_values());
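    // If the instruction remains scalar after vectorization, VectorTy was set
    // to the (possibly truncated) scalar type above and we pay for VF scalar
    // copies; otherwise a single vector instruction is costed.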
6149     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6150     return N * TTI.getArithmeticInstrCost(
6151                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6152                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
6153   }
6154   case Instruction::FNeg: {
6155     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6156     return N * TTI.getArithmeticInstrCost(
6157                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6158                    TargetTransformInfo::OK_AnyValue,
6159                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6160                    I->getOperand(0));
6161   }
6162   case Instruction::Select: {
6163     SelectInst *SI = cast<SelectInst>(I);
6164     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6165     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6166     Type *CondTy = SI->getCondition()->getType();
6167     if (!ScalarCond)
6168       CondTy = VectorType::get(CondTy, VF);
6169 
6170     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6171   }
6172   case Instruction::ICmp:
6173   case Instruction::FCmp: {
6174     Type *ValTy = I->getOperand(0)->getType();
6175     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6176     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6177       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6178     VectorTy = ToVectorTy(ValTy, VF);
6179     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6180   }
6181   case Instruction::Store:
6182   case Instruction::Load: {
6183     unsigned Width = VF;
6184     if (Width > 1) {
6185       InstWidening Decision = getWideningDecision(I, Width);
6186       assert(Decision != CM_Unknown &&
6187              "CM decision should be taken at this point");
6188       if (Decision == CM_Scalarize)
6189         Width = 1;
6190     }
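    // Report a scalarized access with its scalar value type rather than a
    // VF-wide vector type.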
6191     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6192     return getMemoryInstructionCost(I, VF);
6193   }
6194   case Instruction::ZExt:
6195   case Instruction::SExt:
6196   case Instruction::FPToUI:
6197   case Instruction::FPToSI:
6198   case Instruction::FPExt:
6199   case Instruction::PtrToInt:
6200   case Instruction::IntToPtr:
6201   case Instruction::SIToFP:
6202   case Instruction::UIToFP:
6203   case Instruction::Trunc:
6204   case Instruction::FPTrunc:
6205   case Instruction::BitCast: {
6206     // We optimize the truncation of induction variables having constant
6207     // integer steps. The cost of these truncations is the same as the scalar
6208     // operation.
6209     if (isOptimizableIVTruncate(I, VF)) {
6210       auto *Trunc = cast<TruncInst>(I);
6211       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6212                                   Trunc->getSrcTy(), Trunc);
6213     }
6214 
6215     Type *SrcScalarTy = I->getOperand(0)->getType();
6216     Type *SrcVecTy =
6217         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6218     if (canTruncateToMinimalBitwidth(I, VF)) {
6219       // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
6221       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6222       //
6223       // Calculate the modified src and dest types.
6224       Type *MinVecTy = VectorTy;
6225       if (I->getOpcode() == Instruction::Trunc) {
6226         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6227         VectorTy =
6228             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6229       } else if (I->getOpcode() == Instruction::ZExt ||
6230                  I->getOpcode() == Instruction::SExt) {
6231         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6232         VectorTy =
6233             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6234       }
6235     }
6236 
6237     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6238     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6239   }
6240   case Instruction::Call: {
6241     bool NeedToScalarize;
6242     CallInst *CI = cast<CallInst>(I);
6243     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6244     if (getVectorIntrinsicIDForCall(CI, TLI))
6245       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6246     return CallCost;
6247   }
6248   default:
6249     // The cost of executing VF copies of the scalar instruction. This opcode
6250     // is unknown. Assume that it is the same as 'mul'.
6251     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6252            getScalarizationOverhead(I, VF);
6253   } // end of switch.
6254 }
6255 
6256 char LoopVectorize::ID = 0;
6257 
6258 static const char lv_name[] = "Loop Vectorization";
6259 
6260 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6261 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6262 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6263 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6264 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6265 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6266 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6267 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6268 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6269 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6270 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6271 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6272 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6273 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6274 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6275 
6276 namespace llvm {
6277 
6278 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6279 
6280 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6281                               bool VectorizeOnlyWhenForced) {
6282   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6283 }
6284 
6285 } // end namespace llvm
6286 
6287 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6288   // Check if the pointer operand of a load or store instruction is
6289   // consecutive.
6290   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6291     return Legal->isConsecutivePtr(Ptr);
6292   return false;
6293 }
6294 
6295 void LoopVectorizationCostModel::collectValuesToIgnore() {
6296   // Ignore ephemeral values.
6297   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6298 
6299   // Ignore type-promoting instructions we identified during reduction
6300   // detection.
6301   for (auto &Reduction : *Legal->getReductionVars()) {
6302     RecurrenceDescriptor &RedDes = Reduction.second;
6303     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6304     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6305   }
6306   // Ignore type-casting instructions we identified during induction
6307   // detection.
6308   for (auto &Induction : *Legal->getInductionVars()) {
6309     InductionDescriptor &IndDes = Induction.second;
6310     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6311     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6312   }
6313 }
6314 
6315 // TODO: we could return a pair of values that specify the max VF and
6316 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6318 // doesn't have a cost model that can choose which plan to execute if
6319 // more than one is generated.
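// Determine the VF for the VPlan-native path as the number of elements of the
// widest type used in the loop that fit into the widest vector register.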
6320 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6321                                  LoopVectorizationCostModel &CM) {
6322   unsigned WidestType;
6323   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6324   return WidestVectorRegBits / WidestType;
6325 }
6326 
6327 VectorizationFactor
6328 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6329   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
6334   if (!OrigLoop->empty()) {
6335     // If the user doesn't provide a vectorization factor, determine a
6336     // reasonable one.
6337     if (!UserVF) {
6338       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6339       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6340 
6341       // Make sure we have a VF > 1 for stress testing.
6342       if (VPlanBuildStressTest && VF < 2) {
6343         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6344                           << "overriding computed VF.\n");
6345         VF = 4;
6346       }
6347     }
6348     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6349     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6350     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6351                       << " to build VPlans.\n");
6352     buildVPlans(VF, VF);
6353 
6354     // For VPlan build stress testing, we bail out after VPlan construction.
6355     if (VPlanBuildStressTest)
6356       return VectorizationFactor::Disabled();
6357 
6358     return {VF, 0};
6359   }
6360 
6361   LLVM_DEBUG(
6362       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6363                 "VPlan-native path.\n");
6364   return VectorizationFactor::Disabled();
6365 }
6366 
6367 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6368   assert(OrigLoop->empty() && "Inner loop expected.");
6369   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6371     return None;
6372 
6373   // Invalidate interleave groups if all blocks of loop will be predicated.
6374   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6375       !useMaskedInterleavedAccesses(*TTI)) {
6376     LLVM_DEBUG(
6377         dbgs()
6378         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6379            "which requires masked-interleaved support.\n");
6380     CM.InterleaveInfo.reset();
6381   }
6382 
6383   if (UserVF) {
6384     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6385     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6386     // Collect the instructions (and their associated costs) that will be more
6387     // profitable to scalarize.
6388     CM.selectUserVectorizationFactor(UserVF);
6389     buildVPlansWithVPRecipes(UserVF, UserVF);
6390     LLVM_DEBUG(printPlans(dbgs()));
6391     return {{UserVF, 0}};
6392   }
6393 
6394   unsigned MaxVF = MaybeMaxVF.getValue();
6395   assert(MaxVF != 0 && "MaxVF is zero.");
6396 
6397   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6398     // Collect Uniform and Scalar instructions after vectorization with VF.
6399     CM.collectUniformsAndScalars(VF);
6400 
6401     // Collect the instructions (and their associated costs) that will be more
6402     // profitable to scalarize.
6403     if (VF > 1)
6404       CM.collectInstsToScalarize(VF);
6405   }
6406 
6407   buildVPlansWithVPRecipes(1, MaxVF);
6408   LLVM_DEBUG(printPlans(dbgs()));
6409   if (MaxVF == 1)
6410     return VectorizationFactor::Disabled();
6411 
6412   // Select the optimal vectorization factor.
6413   return CM.selectVectorizationFactor(MaxVF);
6414 }
6415 
6416 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6417   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6418                     << '\n');
6419   BestVF = VF;
6420   BestUF = UF;
6421 
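  // Keep only the VPlan that contains the chosen VF; the remaining candidate
  // plans are no longer needed.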
6422   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6423     return !Plan->hasVF(VF);
6424   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6426 }
6427 
6428 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6429                                            DominatorTree *DT) {
6430   // Perform the actual loop transformation.
6431 
6432   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6433   VPCallbackILV CallbackILV(ILV);
6434 
6435   VPTransformState State{BestVF, BestUF,      LI,
6436                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6437                          &ILV,   CallbackILV};
6438   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6439   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6440 
6441   //===------------------------------------------------===//
6442   //
  // Notice: any optimization or new instruction that goes
6444   // into the code below should also be implemented in
6445   // the cost-model.
6446   //
6447   //===------------------------------------------------===//
6448 
6449   // 2. Copy and widen instructions from the old loop into the new loop.
6450   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6451   VPlans.front()->execute(&State);
6452 
6453   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6454   //    predication, updating analyses.
6455   ILV.fixVectorizedLoop();
6456 }
6457 
6458 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6459     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6460   BasicBlock *Latch = OrigLoop->getLoopLatch();
6461 
6462   // We create new control-flow for the vectorized loop, so the original
6463   // condition will be dead after vectorization if it's only used by the
6464   // branch.
6465   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6466   if (Cmp && Cmp->hasOneUse())
6467     DeadInstructions.insert(Cmp);
6468 
6469   // We create new "steps" for induction variable updates to which the original
6470   // induction variables map. An original update instruction will be dead if
6471   // all its users except the induction variable are dead.
6472   for (auto &Induction : *Legal->getInductionVars()) {
6473     PHINode *Ind = Induction.first;
6474     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6475     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6476           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6477                                  DeadInstructions.end();
6478         }))
6479       DeadInstructions.insert(IndUpdate);
6480 
6481     // We record as "Dead" also the type-casting instructions we had identified
6482     // during induction analysis. We don't need any handling for them in the
6483     // vectorized loop because we have proven that, under a proper runtime
6484     // test guarding the vectorized loop, the value of the phi, and the casted
6485     // value of the phi, are the same. The last instruction in this casting chain
6486     // will get its scalar/vector/widened def from the scalar/vector/widened def
6487     // of the respective phi node. Any other casts in the induction def-use chain
6488     // have no other uses outside the phi update chain, and will be ignored.
6489     InductionDescriptor &IndDes = Induction.second;
6490     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6491     DeadInstructions.insert(Casts.begin(), Casts.end());
6492   }
6493 }
6494 
6495 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6496 
6497 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6498 
6499 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6500                                         Instruction::BinaryOps BinOp) {
6501   // When unrolling and the VF is 1, we only need to add a simple scalar.
6502   Type *Ty = Val->getType();
6503   assert(!Ty->isVectorTy() && "Val must be a scalar");
6504 
6505   if (Ty->isFloatingPointTy()) {
6506     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6507 
6508     // Floating point operations had to be 'fast' to enable the unrolling.
6509     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6510     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6511   }
6512   Constant *C = ConstantInt::get(Ty, StartIdx);
6513   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6514 }
6515 
6516 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6517   SmallVector<Metadata *, 4> MDs;
6518   // Reserve first location for self reference to the LoopID metadata node.
6519   MDs.push_back(nullptr);
6520   bool IsUnrollMetadata = false;
6521   MDNode *LoopID = L->getLoopID();
6522   if (LoopID) {
6523     // First find existing loop unrolling disable metadata.
6524     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6525       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6526       if (MD) {
6527         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6528         IsUnrollMetadata =
6529             S && S->getString().startswith("llvm.loop.unroll.disable");
6530       }
6531       MDs.push_back(LoopID->getOperand(i));
6532     }
6533   }
6534 
6535   if (!IsUnrollMetadata) {
6536     // Add runtime unroll disable metadata.
6537     LLVMContext &Context = L->getHeader()->getContext();
6538     SmallVector<Metadata *, 1> DisableOperands;
6539     DisableOperands.push_back(
6540         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6541     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6542     MDs.push_back(DisableNode);
6543     MDNode *NewLoopID = MDNode::get(Context, MDs);
6544     // Set operand 0 to refer to the loop id itself.
6545     NewLoopID->replaceOperandWith(0, NewLoopID);
6546     L->setLoopID(NewLoopID);
6547   }
6548 }
6549 
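/// Test \p Predicate on Range.Start and clamp Range.End down to the first
/// power-of-two VF for which the predicate gives a different answer, so that
/// all VFs remaining in \p Range share the decision taken for Range.Start.
/// Returns the predicate's value at Range.Start.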
6550 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6551     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6552   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6553   bool PredicateAtRangeStart = Predicate(Range.Start);
6554 
6555   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6556     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6557       Range.End = TmpVF;
6558       break;
6559     }
6560 
6561   return PredicateAtRangeStart;
6562 }
6563 
6564 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6565 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6566 /// of VF's starting at a given VF and extending it as much as possible. Each
6567 /// vectorization decision can potentially shorten this sub-range during
6568 /// buildVPlan().
6569 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6570   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6571     VFRange SubRange = {VF, MaxVF + 1};
6572     VPlans.push_back(buildVPlan(SubRange));
6573     VF = SubRange.End;
6574   }
6575 }
6576 
6577 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6578                                          VPlanPtr &Plan) {
6579   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6580 
6581   // Look for cached value.
6582   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6583   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6584   if (ECEntryIt != EdgeMaskCache.end())
6585     return ECEntryIt->second;
6586 
6587   VPValue *SrcMask = createBlockInMask(Src, Plan);
6588 
6589   // The terminator has to be a branch inst!
6590   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6591   assert(BI && "Unexpected terminator found");
6592 
6593   if (!BI->isConditional())
6594     return EdgeMaskCache[Edge] = SrcMask;
6595 
6596   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6597   assert(EdgeMask && "No Edge Mask found for condition");
6598 
6599   if (BI->getSuccessor(0) != Dst)
6600     EdgeMask = Builder.createNot(EdgeMask);
6601 
6602   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6603     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6604 
6605   return EdgeMaskCache[Edge] = EdgeMask;
6606 }
6607 
6608 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6609   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6610 
6611   // Look for cached value.
6612   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6613   if (BCEntryIt != BlockMaskCache.end())
6614     return BCEntryIt->second;
6615 
6616   // All-one mask is modelled as no-mask following the convention for masked
6617   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6618   VPValue *BlockMask = nullptr;
6619 
6620   if (OrigLoop->getHeader() == BB) {
6621     if (!CM.blockNeedsPredication(BB))
6622       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6623 
6624     // Introduce the early-exit compare IV <= BTC to form header block mask.
6625     // This is used instead of IV < TC because TC may wrap, unlike BTC.
6626     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6627     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6628     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6629     return BlockMaskCache[BB] = BlockMask;
6630   }
6631 
6632   // This is the block mask. We OR all incoming edges.
6633   for (auto *Predecessor : predecessors(BB)) {
6634     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6635     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6636       return BlockMaskCache[BB] = EdgeMask;
6637 
6638     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6639       BlockMask = EdgeMask;
6640       continue;
6641     }
6642 
6643     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6644   }
6645 
6646   return BlockMaskCache[BB] = BlockMask;
6647 }
6648 
6649 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6650                                                            VFRange &Range,
6651                                                            VPlanPtr &Plan) {
6652   const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6653   if (!IG)
6654     return nullptr;
6655 
6656   // Now check if IG is relevant for VF's in the given range.
6657   auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6658     return [=](unsigned VF) -> bool {
6659       return (VF >= 2 && // Query is illegal for VF == 1
6660               CM.getWideningDecision(I, VF) ==
6661                   LoopVectorizationCostModel::CM_Interleave);
6662     };
6663   };
6664   if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6665     return nullptr;
6666 
6667   // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
6668   // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
6669   // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
6670   assert(I == IG->getInsertPos() &&
6671          "Generating a recipe for an adjunct member of an interleave group");
6672 
6673   VPValue *Mask = nullptr;
6674   if (Legal->isMaskRequired(I))
6675     Mask = createBlockInMask(I->getParent(), Plan);
6676 
6677   return new VPInterleaveRecipe(IG, Mask);
6678 }
6679 
6680 VPWidenMemoryInstructionRecipe *
6681 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6682                                   VPlanPtr &Plan) {
6683   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6684     return nullptr;
6685 
6686   auto willWiden = [&](unsigned VF) -> bool {
6687     if (VF == 1)
6688       return false;
6689     if (CM.isScalarAfterVectorization(I, VF) ||
6690         CM.isProfitableToScalarize(I, VF))
6691       return false;
6692     LoopVectorizationCostModel::InstWidening Decision =
6693         CM.getWideningDecision(I, VF);
6694     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6695            "CM decision should be taken at this point.");
6696     assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6697            "Interleave memory opportunity should be caught earlier.");
6698     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6699   };
6700 
6701   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6702     return nullptr;
6703 
6704   VPValue *Mask = nullptr;
6705   if (Legal->isMaskRequired(I))
6706     Mask = createBlockInMask(I->getParent(), Plan);
6707 
6708   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6709 }
6710 
6711 VPWidenIntOrFpInductionRecipe *
6712 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6713   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6714     // Check if this is an integer or fp induction. If so, build the recipe that
6715     // produces its scalar and vector values.
6716     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6717     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6718         II.getKind() == InductionDescriptor::IK_FpInduction)
6719       return new VPWidenIntOrFpInductionRecipe(Phi);
6720 
6721     return nullptr;
6722   }
6723 
6724   // Optimize the special case where the source is a constant integer
6725   // induction variable. Notice that we can only optimize the 'trunc' case
6726   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6727   // (c) other casts depend on pointer size.
6728 
6729   // Determine whether \p K is a truncation based on an induction variable that
6730   // can be optimized.
6731   auto isOptimizableIVTruncate =
6732       [&](Instruction *K) -> std::function<bool(unsigned)> {
6733     return
6734         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6735   };
6736 
6737   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6738                                isOptimizableIVTruncate(I), Range))
6739     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6740                                              cast<TruncInst>(I));
6741   return nullptr;
6742 }
6743 
6744 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6745   PHINode *Phi = dyn_cast<PHINode>(I);
6746   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6747     return nullptr;
6748 
6749   // We know that all PHIs in non-header blocks are converted into selects, so
6750   // we don't have to worry about the insertion order and we can just use the
6751   // builder. At this point we generate the predication tree. There may be
6752   // duplications since this is a simple recursive scan, but future
6753   // optimizations will clean it up.
6754 
6755   SmallVector<VPValue *, 2> Masks;
6756   unsigned NumIncoming = Phi->getNumIncomingValues();
6757   for (unsigned In = 0; In < NumIncoming; In++) {
6758     VPValue *EdgeMask =
6759       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6760     assert((EdgeMask || NumIncoming == 1) &&
6761            "Multiple predecessors with one having a full mask");
6762     if (EdgeMask)
6763       Masks.push_back(EdgeMask);
6764   }
6765   return new VPBlendRecipe(Phi, Masks);
6766 }
6767 
6768 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6769                                  VFRange &Range) {
6770 
6771   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6772       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6773 
6774   if (IsPredicated)
6775     return false;
6776 
6777   auto IsVectorizableOpcode = [](unsigned Opcode) {
6778     switch (Opcode) {
6779     case Instruction::Add:
6780     case Instruction::And:
6781     case Instruction::AShr:
6782     case Instruction::BitCast:
6783     case Instruction::Br:
6784     case Instruction::Call:
6785     case Instruction::FAdd:
6786     case Instruction::FCmp:
6787     case Instruction::FDiv:
6788     case Instruction::FMul:
6789     case Instruction::FNeg:
6790     case Instruction::FPExt:
6791     case Instruction::FPToSI:
6792     case Instruction::FPToUI:
6793     case Instruction::FPTrunc:
6794     case Instruction::FRem:
6795     case Instruction::FSub:
6796     case Instruction::GetElementPtr:
6797     case Instruction::ICmp:
6798     case Instruction::IntToPtr:
6799     case Instruction::Load:
6800     case Instruction::LShr:
6801     case Instruction::Mul:
6802     case Instruction::Or:
6803     case Instruction::PHI:
6804     case Instruction::PtrToInt:
6805     case Instruction::SDiv:
6806     case Instruction::Select:
6807     case Instruction::SExt:
6808     case Instruction::Shl:
6809     case Instruction::SIToFP:
6810     case Instruction::SRem:
6811     case Instruction::Store:
6812     case Instruction::Sub:
6813     case Instruction::Trunc:
6814     case Instruction::UDiv:
6815     case Instruction::UIToFP:
6816     case Instruction::URem:
6817     case Instruction::Xor:
6818     case Instruction::ZExt:
6819       return true;
6820     }
6821     return false;
6822   };
6823 
6824   if (!IsVectorizableOpcode(I->getOpcode()))
6825     return false;
6826 
6827   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6828     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6829     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6830                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6831       return false;
6832   }
6833 
6834   auto willWiden = [&](unsigned VF) -> bool {
6835     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6836                              CM.isProfitableToScalarize(I, VF)))
6837       return false;
6838     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6839       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6840       // The following case may be scalarized depending on the VF.
      // The flag shows whether we use an intrinsic or a plain call for the
      // vectorized version of the instruction.
6843       // Is it beneficial to perform intrinsic call compared to lib call?
6844       bool NeedToScalarize;
6845       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6846       bool UseVectorIntrinsic =
6847           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6848       return UseVectorIntrinsic || !NeedToScalarize;
6849     }
6850     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6851       assert(CM.getWideningDecision(I, VF) ==
6852                  LoopVectorizationCostModel::CM_Scalarize &&
6853              "Memory widening decisions should have been taken care by now");
6854       return false;
6855     }
6856     return true;
6857   };
6858 
6859   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6860     return false;
6861 
6862   // Success: widen this instruction. We optimize the common case where
6863   // consecutive instructions can be represented by a single recipe.
6864   if (!VPBB->empty()) {
6865     VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6866     if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6867       return true;
6868   }
6869 
6870   VPBB->appendRecipe(new VPWidenRecipe(I));
6871   return true;
6872 }
6873 
6874 VPBasicBlock *VPRecipeBuilder::handleReplication(
6875     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6876     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6877     VPlanPtr &Plan) {
6878   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6879       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6880       Range);
6881 
6882   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6883       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6884 
6885   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6886 
6887   // Find if I uses a predicated instruction. If so, it will use its scalar
6888   // value. Avoid hoisting the insert-element which packs the scalar value into
6889   // a vector value, as that happens iff all users use the vector value.
6890   for (auto &Op : I->operands())
6891     if (auto *PredInst = dyn_cast<Instruction>(Op))
6892       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6893         PredInst2Recipe[PredInst]->setAlsoPack(false);
6894 
6895   // Finalize the recipe for Instr, first if it is not predicated.
6896   if (!IsPredicated) {
6897     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6898     VPBB->appendRecipe(Recipe);
6899     return VPBB;
6900   }
6901   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6902   assert(VPBB->getSuccessors().empty() &&
6903          "VPBB has successors when handling predicated replication.");
6904   // Record predicated instructions for above packing optimizations.
6905   PredInst2Recipe[I] = Recipe;
6906   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6907   VPBlockUtils::insertBlockAfter(Region, VPBB);
6908   auto *RegSucc = new VPBasicBlock();
6909   VPBlockUtils::insertBlockAfter(RegSucc, Region);
6910   return RegSucc;
6911 }
6912 
6913 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6914                                                       VPRecipeBase *PredRecipe,
6915                                                       VPlanPtr &Plan) {
6916   // Instructions marked for predication are replicated and placed under an
6917   // if-then construct to prevent side-effects.
6918 
6919   // Generate recipes to compute the block mask for this region.
6920   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6921 
6922   // Build the triangular if-then region.
6923   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6924   assert(Instr->getParent() && "Predicated instruction not in any basic block");
6925   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6926   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6927   auto *PHIRecipe =
6928       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6929   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6930   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6931   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6932 
6933   // Note: first set Entry as region entry and then connect successors starting
6934   // from it in order, to propagate the "parent" of each VPBasicBlock.
6935   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6936   VPBlockUtils::connectBlocks(Pred, Exit);
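  // The resulting single-entry single-exit region has the following shape,
  // where the entry block branches on the block-in mask:
  //
  //        <RegionName>.entry
  //          /         \
  //   <RegionName>.if    |
  //          \          /
  //      <RegionName>.continue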
6937 
6938   return Region;
6939 }
6940 
6941 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6942                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
6943   VPRecipeBase *Recipe = nullptr;
6944   // Check if Instr should belong to an interleave memory recipe, or already
6945   // does. In the latter case Instr is irrelevant.
6946   if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6947     VPBB->appendRecipe(Recipe);
6948     return true;
6949   }
6950 
6951   // Check if Instr is a memory operation that should be widened.
6952   if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6953     VPBB->appendRecipe(Recipe);
6954     return true;
6955   }
6956 
6957   // Check if Instr should form some PHI recipe.
6958   if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6959     VPBB->appendRecipe(Recipe);
6960     return true;
6961   }
6962   if ((Recipe = tryToBlend(Instr, Plan))) {
6963     VPBB->appendRecipe(Recipe);
6964     return true;
6965   }
6966   if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6967     VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6968     return true;
6969   }
6970 
6971   // Check if Instr is to be widened by a general VPWidenRecipe, after
6972   // having first checked for specific widening recipes that deal with
6973   // Interleave Groups, Inductions and Phi nodes.
6974   if (tryToWiden(Instr, VPBB, Range))
6975     return true;
6976 
6977   return false;
6978 }
6979 
6980 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6981                                                         unsigned MaxVF) {
6982   assert(OrigLoop->empty() && "Inner loop expected.");
6983 
6984   // Collect conditions feeding internal conditional branches; they need to be
6985   // represented in VPlan for it to model masking.
6986   SmallPtrSet<Value *, 1> NeedDef;
6987 
6988   auto *Latch = OrigLoop->getLoopLatch();
6989   for (BasicBlock *BB : OrigLoop->blocks()) {
6990     if (BB == Latch)
6991       continue;
6992     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6993     if (Branch && Branch->isConditional())
6994       NeedDef.insert(Branch->getCondition());
6995   }
6996 
6997   // If the tail is to be folded by masking, the primary induction variable
6998   // needs to be represented in VPlan for it to model early-exit masking.
6999   // Also, both the Phi and the live-out instruction of each reduction are
7000   // required in order to introduce a select between them in VPlan.
7001   if (CM.foldTailByMasking()) {
7002     NeedDef.insert(Legal->getPrimaryInduction());
7003     for (auto &Reduction : *Legal->getReductionVars()) {
7004       NeedDef.insert(Reduction.first);
7005       NeedDef.insert(Reduction.second.getLoopExitInstr());
7006     }
7007   }
7008 
7009   // Collect instructions from the original loop that will become trivially dead
7010   // in the vectorized loop. We don't need to vectorize these instructions. For
7011   // example, original induction update instructions can become dead because we
7012   // separately emit induction "steps" when generating code for the new loop.
7013   // Similarly, we create a new latch condition when setting up the structure
7014   // of the new loop, so the old one can become dead.
7015   SmallPtrSet<Instruction *, 4> DeadInstructions;
7016   collectTriviallyDeadInstructions(DeadInstructions);
7017 
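  // Build one VPlan per maximal sub-range of VFs that share the same
  // vectorization decisions; buildVPlanWithVPRecipes may shrink SubRange.End
  // whenever a decision changes within the range.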
7018   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7019     VFRange SubRange = {VF, MaxVF + 1};
7020     VPlans.push_back(
7021         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
7022     VF = SubRange.End;
7023   }
7024 }
7025 
7026 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7027     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7028     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7029   // Hold a mapping from predicated instructions to their recipes, in order to
7030   // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7032   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7033 
7034   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7035   DenseMap<Instruction *, Instruction *> SinkAfterInverse;
7036 
7037   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7038   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7039   auto Plan = std::make_unique<VPlan>(VPBB);
7040 
7041   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7042   // Represent values that will have defs inside VPlan.
7043   for (Value *V : NeedDef)
7044     Plan->addVPValue(V);
7045 
7046   // Scan the body of the loop in a topological order to visit each basic block
7047   // after having visited its predecessor basic blocks.
7048   LoopBlocksDFS DFS(OrigLoop);
7049   DFS.perform(LI);
7050 
7051   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and will fill a new VPBasicBlock.
7054     unsigned VPBBsForBB = 0;
7055     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7056     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7057     VPBB = FirstVPBBForBB;
7058     Builder.setInsertPoint(VPBB);
7059 
7060     std::vector<Instruction *> Ingredients;
7061 
7062     // Organize the ingredients to vectorize from current basic block in the
7063     // right order.
7064     for (Instruction &I : BB->instructionsWithoutDebug()) {
7065       Instruction *Instr = &I;
7066 
7067       // First filter out irrelevant instructions, to ensure no recipes are
7068       // built for them.
7069       if (isa<BranchInst>(Instr) ||
7070           DeadInstructions.find(Instr) != DeadInstructions.end())
7071         continue;
7072 
      // Instr is a member of an InterleaveGroup for Range.Start. If it's an
      // adjunct member of the IG, do not construct any Recipe for it.
7075       const InterleaveGroup<Instruction> *IG =
7076           CM.getInterleavedAccessGroup(Instr);
7077       if (IG && Instr != IG->getInsertPos() &&
7078           Range.Start >= 2 && // Query is illegal for VF == 1
7079           CM.getWideningDecision(Instr, Range.Start) ==
7080               LoopVectorizationCostModel::CM_Interleave) {
7081         auto SinkCandidate = SinkAfterInverse.find(Instr);
7082         if (SinkCandidate != SinkAfterInverse.end())
7083           Ingredients.push_back(SinkCandidate->second);
7084         continue;
7085       }
7086 
7087       // Move instructions to handle first-order recurrences, step 1: avoid
7088       // handling this instruction until after we've handled the instruction it
7089       // should follow.
7090       auto SAIt = SinkAfter.find(Instr);
7091       if (SAIt != SinkAfter.end()) {
7092         LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
7093                           << *SAIt->second
7094                           << " to vectorize a 1st order recurrence.\n");
7095         SinkAfterInverse[SAIt->second] = Instr;
7096         continue;
7097       }
7098 
7099       Ingredients.push_back(Instr);
7100 
7101       // Move instructions to handle first-order recurrences, step 2: push the
7102       // instruction to be sunk at its insertion point.
7103       auto SAInvIt = SinkAfterInverse.find(Instr);
7104       if (SAInvIt != SinkAfterInverse.end())
7105         Ingredients.push_back(SAInvIt->second);
7106     }
7107 
7108     // Introduce each ingredient into VPlan.
7109     for (Instruction *Instr : Ingredients) {
7110       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7111         continue;
7112 
7113       // Otherwise, if all widening options failed, Instruction is to be
7114       // replicated. This may create a successor for VPBB.
7115       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7116           Instr, Range, VPBB, PredInst2Recipe, Plan);
7117       if (NextVPBB != VPBB) {
7118         VPBB = NextVPBB;
7119         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7120                                     : "");
7121       }
7122     }
7123   }
7124 
7125   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic blocks with no recipes.
7128   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7129   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7130   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7131   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7132   delete PreEntry;
7133 
7134   // Finally, if tail is folded by masking, introduce selects between the phi
7135   // and the live-out instruction of each reduction, at the end of the latch.
7136   if (CM.foldTailByMasking()) {
7137     Builder.setInsertPoint(VPBB);
7138     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7139     for (auto &Reduction : *Legal->getReductionVars()) {
7140       VPValue *Phi = Plan->getVPValue(Reduction.first);
7141       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7142       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7143     }
7144   }
7145 
7146   std::string PlanName;
7147   raw_string_ostream RSO(PlanName);
7148   unsigned VF = Range.Start;
7149   Plan->addVF(VF);
7150   RSO << "Initial VPlan for VF={" << VF;
7151   for (VF *= 2; VF < Range.End; VF *= 2) {
7152     Plan->addVF(VF);
7153     RSO << "," << VF;
7154   }
7155   RSO << "},UF>=1";
7156   RSO.flush();
7157   Plan->setName(PlanName);
7158 
7159   return Plan;
7160 }
7161 
7162 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
7167   assert(!OrigLoop->empty());
7168   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7169 
7170   // Create new empty VPlan
7171   auto Plan = std::make_unique<VPlan>();
7172 
7173   // Build hierarchical CFG
7174   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7175   HCFGBuilder.buildHierarchicalCFG();
7176 
7177   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7178     Plan->addVF(VF);
7179 
7180   if (EnableVPlanPredication) {
7181     VPlanPredicator VPP(*Plan);
7182     VPP.predicate();
7183 
7184     // Avoid running transformation to recipes until masked code generation in
7185     // VPlan-native path is in place.
7186     return Plan;
7187   }
7188 
7189   SmallPtrSet<Instruction *, 1> DeadInstructions;
7190   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7191       Plan, Legal->getInductionVars(), DeadInstructions);
7192 
7193   return Plan;
7194 }
7195 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7200 
7201 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7202   O << " +\n"
7203     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7204   IG->getInsertPos()->printAsOperand(O, false);
7205   if (User) {
7206     O << ", ";
7207     User->getOperand(0)->printAsOperand(O);
7208   }
7209   O << "\\l\"";
7210   for (unsigned i = 0; i < IG->getFactor(); ++i)
7211     if (Instruction *I = IG->getMember(i))
7212       O << " +\n"
7213         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7214 }
7215 
7216 void VPWidenRecipe::execute(VPTransformState &State) {
7217   for (auto &Instr : make_range(Begin, End))
7218     State.ILV->widenInstruction(Instr);
7219 }
7220 
7221 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7222   assert(!State.Instance && "Int or FP induction being replicated.");
7223   State.ILV->widenIntOrFpInduction(IV, Trunc);
7224 }
7225 
7226 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7227   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7228 }
7229 
7230 void VPBlendRecipe::execute(VPTransformState &State) {
7231   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7232   // We know that all PHIs in non-header blocks are converted into
7233   // selects, so we don't have to worry about the insertion order and we
7234   // can just use the builder.
7235   // At this point we generate the predication tree. There may be
7236   // duplications since this is a simple recursive scan, but future
7237   // optimizations will clean it up.
7238 
7239   unsigned NumIncoming = Phi->getNumIncomingValues();
7240 
7241   assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7243   // Generate a sequence of selects of the form:
7244   // SELECT(Mask3, In3,
7245   //      SELECT(Mask2, In2,
7246   //                   ( ...)))
7247   InnerLoopVectorizer::VectorParts Entry(State.UF);
7248   for (unsigned In = 0; In < NumIncoming; ++In) {
7249     for (unsigned Part = 0; Part < State.UF; ++Part) {
7250       // We might have single edge PHIs (blocks) - use an identity
7251       // 'select' for the first PHI operand.
7252       Value *In0 =
7253           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7254       if (In == 0)
7255         Entry[Part] = In0; // Initialize with the first incoming value.
7256       else {
7257         // Select between the current value and the previous incoming edge
7258         // based on the incoming mask.
7259         Value *Cond = State.get(User->getOperand(In), Part);
7260         Entry[Part] =
7261             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7262       }
7263     }
7264   }
7265   for (unsigned Part = 0; Part < State.UF; ++Part)
7266     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7267 }
7268 
7269 void VPInterleaveRecipe::execute(VPTransformState &State) {
7270   assert(!State.Instance && "Interleave group being replicated.");
7271   if (!User)
7272     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7273 
7274   // Last (and currently only) operand is a mask.
7275   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7276   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7277   for (unsigned Part = 0; Part < State.UF; ++Part)
7278     MaskValues[Part] = State.get(Mask, Part);
7279   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7280 }
7281 
7282 void VPReplicateRecipe::execute(VPTransformState &State) {
7283   if (State.Instance) { // Generate a single instance.
7284     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7285     // Insert scalar instance packing it into a vector.
7286     if (AlsoPack && State.VF > 1) {
7287       // If we're constructing lane 0, initialize to start from undef.
7288       if (State.Instance->Lane == 0) {
7289         Value *Undef =
7290             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7291         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7292       }
7293       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7294     }
7295     return;
7296   }
7297 
7298   // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case we generate only the first lane for
  // each of the UF parts.
7301   unsigned EndLane = IsUniform ? 1 : State.VF;
7302   for (unsigned Part = 0; Part < State.UF; ++Part)
7303     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7304       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7305 }
7306 
7307 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7308   assert(State.Instance && "Branch on Mask works only on single instance.");
7309 
7310   unsigned Part = State.Instance->Part;
7311   unsigned Lane = State.Instance->Lane;
7312 
7313   Value *ConditionBit = nullptr;
7314   if (!User) // Block in mask is all-one.
7315     ConditionBit = State.Builder.getTrue();
7316   else {
7317     VPValue *BlockInMask = User->getOperand(0);
7318     ConditionBit = State.get(BlockInMask, Part);
7319     if (ConditionBit->getType()->isVectorTy())
7320       ConditionBit = State.Builder.CreateExtractElement(
7321           ConditionBit, State.Builder.getInt32(Lane));
7322   }
7323 
7324   // Replace the temporary unreachable terminator with a new conditional branch,
7325   // whose two destinations will be set later when they are created.
7326   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7327   assert(isa<UnreachableInst>(CurrentTerminator) &&
7328          "Expected to replace unreachable terminator with conditional branch.");
7329   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7330   CondBr->setSuccessor(0, nullptr);
7331   ReplaceInstWithInst(CurrentTerminator, CondBr);
7332 }
7333 
7334 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7335   assert(State.Instance && "Predicated instruction PHI works per instance.");
7336   Instruction *ScalarPredInst = cast<Instruction>(
7337       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7338   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7339   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7340   assert(PredicatingBB && "Predicated block has no single predecessor.");
7341 
7342   // By current pack/unpack logic we need to generate only a single phi node: if
7343   // a vector value for the predicated instruction exists at this point it means
7344   // the instruction has vector users only, and a phi for the vector value is
7345   // needed. In this case the recipe of the predicated instruction is marked to
7346   // also do that packing, thereby "hoisting" the insert-element sequence.
7347   // Otherwise, a phi node for the scalar value is needed.
7348   unsigned Part = State.Instance->Part;
7349   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7350     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7351     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7352     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7353     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7354     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7355     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7356   } else {
7357     Type *PredInstType = PredInst->getType();
7358     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7359     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7360     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7361     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7362   }
7363 }
7364 
7365 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7366   if (!User)
7367     return State.ILV->vectorizeMemoryInstruction(&Instr);
7368 
7369   // Last (and currently only) operand is a mask.
7370   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7371   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7372   for (unsigned Part = 0; Part < State.UF; ++Part)
7373     MaskValues[Part] = State.get(Mask, Part);
7374   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7375 }
7376 
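// Determine how to lower the scalar epilogue of a loop: allow a scalar
// remainder by default, forbid it when the loop should be optimized for size,
// or mark it as not needed when predication (folding the tail into the vector
// body) is preferred.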
7377 static ScalarEpilogueLowering
7378 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7379                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
7380   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7381   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7382       (F->hasOptSize() ||
7383        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
7384     SEL = CM_ScalarEpilogueNotAllowedOptSize;
7385   else if (PreferPredicateOverEpilog || Hints.getPredicate())
7386     SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7387 
7388   return SEL;
7389 }
7390 
7391 // Process the loop in the VPlan-native vectorization path. This path builds
7392 // VPlan upfront in the vectorization pipeline, which makes it possible to
7393 // apply VPlan-to-VPlan transformations from the very beginning without
7394 // modifying the input LLVM IR.
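// A typical candidate for this path is an explicitly annotated outer loop,
// e.g. (illustrative C source):
//   #pragma clang loop vectorize(enable)
//   for (i = 0; i < N; ++i)        // outer loop, handled by this path
//     for (j = 0; j < M; ++j) ...  // inner loop
// and the path is only taken when EnableVPlanNativePath is set (see below).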
7395 static bool processLoopInVPlanNativePath(
7396     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7397     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7398     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7399     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7400     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7401 
7402   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7403   Function *F = L->getHeader()->getParent();
7404   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7405   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7406 
7407   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7408                                 &Hints, IAI);
7409   // Use the planner for outer loop vectorization.
7410   // TODO: CM is not used at this point inside the planner. Turn CM into an
7411   // optional argument if we don't need it in the future.
7412   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7413 
7414   // Get user vectorization factor.
7415   const unsigned UserVF = Hints.getWidth();
7416 
7417   // Plan how to best vectorize, return the best VF and its cost.
7418   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7419 
7420   // If we are stress testing VPlan builds or exercising VPlan predication, do
7421   // not attempt to generate vector code; masked vector code generation support
7422   // will follow soon. Also, do not attempt to vectorize if no vector code will
7423   // be produced.
7423   if (VPlanBuildStressTest || EnableVPlanPredication ||
7424       VectorizationFactor::Disabled() == VF)
7425     return false;
7426 
7427   LVP.setBestPlan(VF.Width, 1);
7428 
7429   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7430                          &CM);
7431   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7432                     << L->getHeader()->getParent()->getName() << "\"\n");
7433   LVP.executePlan(LB, DT);
7434 
7435   // Mark the loop as already vectorized to avoid vectorizing again.
7436   Hints.setAlreadyVectorized();
7437 
7438   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7439   return true;
7440 }
7441 
7442 bool LoopVectorizePass::processLoop(Loop *L) {
7443   assert((EnableVPlanNativePath || L->empty()) &&
7444          "VPlan-native path is not enabled. Only process inner loops.");
7445 
7446 #ifndef NDEBUG
7447   const std::string DebugLocStr = getDebugLocString(L);
7448 #endif /* NDEBUG */
7449 
7450   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7451                     << L->getHeader()->getParent()->getName() << "\" from "
7452                     << DebugLocStr << "\n");
7453 
7454   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7455 
7456   LLVM_DEBUG(
7457       dbgs() << "LV: Loop hints:"
7458              << " force="
7459              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7460                      ? "disabled"
7461                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7462                             ? "enabled"
7463                             : "?"))
7464              << " width=" << Hints.getWidth()
7465              << " unroll=" << Hints.getInterleave() << "\n");
7466 
7467   // Function containing loop
7468   Function *F = L->getHeader()->getParent();
7469 
7470   // Looking at the diagnostic output is the only way to determine if a loop
7471   // was vectorized (other than looking at the IR or machine code), so it
7472   // is important to generate an optimization remark for each loop. Most of
7473   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7474   // generated as OptimizationRemark and OptimizationRemarkMissed are less
7475   // verbose, reporting vectorized loops and unvectorized loops that may
7476   // benefit from vectorization, respectively.
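  // For example, a user can surface these remarks from clang with
  //   -Rpass=loop-vectorize -Rpass-missed=loop-vectorize
  //   -Rpass-analysis=loop-vectorize
  // (illustrative invocation; any remark-consuming driver works).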
7477 
7478   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7479     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7480     return false;
7481   }
7482 
7483   PredicatedScalarEvolution PSE(*SE, *L);
7484 
7485   // Check if it is legal to vectorize the loop.
7486   LoopVectorizationRequirements Requirements(*ORE);
7487   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7488                                 &Requirements, &Hints, DB, AC);
7489   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7490     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7491     Hints.emitRemarkWithHints();
7492     return false;
7493   }
7494 
7495   // Check the function attributes and profiles to find out if this function
7496   // should be optimized for size.
7497   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7498 
7499   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7500   // here. They may require CFG and instruction level transformations before
7501   // even evaluating whether vectorization is profitable. Since we cannot modify
7502   // the incoming IR, we need to build VPlan upfront in the vectorization
7503   // pipeline.
7504   if (!L->empty())
7505     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7506                                         ORE, BFI, PSI, Hints);
7507 
7508   assert(L->empty() && "Inner loop expected.");
7509 
7510   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7511   // count by optimizing for size, to minimize overheads.
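  // For example, a loop known (or profiled) to run only a handful of
  // iterations is only worth vectorizing if the vector body can be entered
  // without paying scalar-iteration overheads, so we switch to the
  // optimize-for-size strategy below unless vectorization was explicitly
  // forced.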
7512   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7513   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7514     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7515                       << "This loop is worth vectorizing only if no scalar "
7516                       << "iteration overheads are incurred.");
7517     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7518       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7519     else {
7520       LLVM_DEBUG(dbgs() << "\n");
7521       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7522     }
7523   }
7524 
7525   // Check the function attributes to see if implicit floats are allowed.
7526   // FIXME: This check doesn't seem correct -- what if the loop is
7527   // an integer loop and the vector instructions selected are purely integer
7528   // vector instructions?
7529   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7530     reportVectorizationFailure(
7531         "Can't vectorize when the NoImplicitFloat attribute is used",
7532         "loop not vectorized due to NoImplicitFloat attribute",
7533         "NoImplicitFloat", ORE, L);
7534     Hints.emitRemarkWithHints();
7535     return false;
7536   }
7537 
7538   // Check if the target supports potentially unsafe FP vectorization.
7539   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7540   // for the target we're vectorizing for, to make sure none of the
7541   // additional fp-math flags can help.
7542   if (Hints.isPotentiallyUnsafe() &&
7543       TTI->isFPVectorizationPotentiallyUnsafe()) {
7544     reportVectorizationFailure(
7545         "Potentially unsafe FP op prevents vectorization",
7546         "loop not vectorized due to unsafe FP support.",
7547         "UnsafeFP", ORE, L);
7548     Hints.emitRemarkWithHints();
7549     return false;
7550   }
7551 
7552   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7553   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7554 
7555   // If an override option has been passed in for interleaved accesses, use it.
7556   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7557     UseInterleaved = EnableInterleavedMemAccesses;
7558 
7559   // Analyze interleaved memory accesses.
7560   if (UseInterleaved) {
7561     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7562   }
7563 
7564   // Use the cost model.
7565   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7566                                 F, &Hints, IAI);
7567   CM.collectValuesToIgnore();
7568 
7569   // Use the planner for vectorization.
7570   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7571 
7572   // Get user vectorization factor.
7573   unsigned UserVF = Hints.getWidth();
7574 
7575   // Plan how to best vectorize, return the best VF and its cost.
7576   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7577 
7578   VectorizationFactor VF = VectorizationFactor::Disabled();
7579   unsigned IC = 1;
7580   unsigned UserIC = Hints.getInterleave();
7581 
7582   if (MaybeVF) {
7583     VF = *MaybeVF;
7584     // Select the interleave count.
7585     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7586   }
7587 
7588   // Identify the diagnostic messages that should be produced.
7589   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7590   bool VectorizeLoop = true, InterleaveLoop = true;
7591   if (Requirements.doesNotMeet(F, L, Hints)) {
7592     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7593                          "requirements.\n");
7594     Hints.emitRemarkWithHints();
7595     return false;
7596   }
7597 
7598   if (VF.Width == 1) {
7599     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7600     VecDiagMsg = std::make_pair(
7601         "VectorizationNotBeneficial",
7602         "the cost-model indicates that vectorization is not beneficial");
7603     VectorizeLoop = false;
7604   }
7605 
7606   if (!MaybeVF && UserIC > 1) {
7607     // Tell the user interleaving was avoided up-front, despite being explicitly
7608     // requested.
7609     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7610                          "interleaving should be avoided up front\n");
7611     IntDiagMsg = std::make_pair(
7612         "InterleavingAvoided",
7613         "Ignoring UserIC, because interleaving was avoided up front");
7614     InterleaveLoop = false;
7615   } else if (IC == 1 && UserIC <= 1) {
7616     // Tell the user interleaving is not beneficial.
7617     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7618     IntDiagMsg = std::make_pair(
7619         "InterleavingNotBeneficial",
7620         "the cost-model indicates that interleaving is not beneficial");
7621     InterleaveLoop = false;
7622     if (UserIC == 1) {
7623       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7624       IntDiagMsg.second +=
7625           " and is explicitly disabled or interleave count is set to 1";
7626     }
7627   } else if (IC > 1 && UserIC == 1) {
7628     // Tell the user interleaving is beneficial, but it is explicitly
7629     // disabled.
7629     LLVM_DEBUG(
7630         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7631     IntDiagMsg = std::make_pair(
7632         "InterleavingBeneficialButDisabled",
7633         "the cost-model indicates that interleaving is beneficial "
7634         "but is explicitly disabled or interleave count is set to 1");
7635     InterleaveLoop = false;
7636   }
7637 
7638   // Override IC if user provided an interleave count.
7639   IC = UserIC > 0 ? UserIC : IC;
7640 
7641   // Emit diagnostic messages, if any.
7642   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7643   if (!VectorizeLoop && !InterleaveLoop) {
7644     // Do not vectorize or interleave the loop.
7645     ORE->emit([&]() {
7646       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7647                                       L->getStartLoc(), L->getHeader())
7648              << VecDiagMsg.second;
7649     });
7650     ORE->emit([&]() {
7651       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7652                                       L->getStartLoc(), L->getHeader())
7653              << IntDiagMsg.second;
7654     });
7655     return false;
7656   } else if (!VectorizeLoop && InterleaveLoop) {
7657     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7658     ORE->emit([&]() {
7659       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7660                                         L->getStartLoc(), L->getHeader())
7661              << VecDiagMsg.second;
7662     });
7663   } else if (VectorizeLoop && !InterleaveLoop) {
7664     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7665                       << ") in " << DebugLocStr << '\n');
7666     ORE->emit([&]() {
7667       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7668                                         L->getStartLoc(), L->getHeader())
7669              << IntDiagMsg.second;
7670     });
7671   } else if (VectorizeLoop && InterleaveLoop) {
7672     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7673                       << ") in " << DebugLocStr << '\n');
7674     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7675   }
7676 
7677   LVP.setBestPlan(VF.Width, IC);
7678 
7679   using namespace ore;
7680   bool DisableRuntimeUnroll = false;
7681   MDNode *OrigLoopID = L->getLoopID();
7682 
7683   if (!VectorizeLoop) {
7684     assert(IC > 1 && "interleave count should not be 1 or 0");
7685     // If we decided that it is not legal to vectorize the loop, then
7686     // interleave it.
7687     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7688                                &CM);
7689     LVP.executePlan(Unroller, DT);
7690 
7691     ORE->emit([&]() {
7692       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7693                                 L->getHeader())
7694              << "interleaved loop (interleaved count: "
7695              << NV("InterleaveCount", IC) << ")";
7696     });
7697   } else {
7698     // If we decided that it is *legal* to vectorize the loop, then do it.
7699     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7700                            &LVL, &CM);
7701     LVP.executePlan(LB, DT);
7702     ++LoopsVectorized;
7703 
7704     // Add metadata to disable runtime unrolling a scalar loop when there are
7705     // no runtime checks about strides and memory. A scalar loop that is
7706     // rarely used is not worth unrolling.
7707     if (!LB.areSafetyChecksAdded())
7708       DisableRuntimeUnroll = true;
7709 
7710     // Report the vectorization decision.
7711     ORE->emit([&]() {
7712       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7713                                 L->getHeader())
7714              << "vectorized loop (vectorization width: "
7715              << NV("VectorizationFactor", VF.Width)
7716              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7717     });
7718   }
7719 
7720   Optional<MDNode *> RemainderLoopID =
7721       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7722                                       LLVMLoopVectorizeFollowupEpilogue});
7723   if (RemainderLoopID.hasValue()) {
7724     L->setLoopID(RemainderLoopID.getValue());
7725   } else {
7726     if (DisableRuntimeUnroll)
7727       AddRuntimeUnrollDisableMetaData(L);
7728 
7729     // Mark the loop as already vectorized to avoid vectorizing again.
7730     Hints.setAlreadyVectorized();
7731   }
7732 
7733   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7734   return true;
7735 }
7736 
7737 bool LoopVectorizePass::runImpl(
7738     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7739     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7740     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7741     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7742     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7743   SE = &SE_;
7744   LI = &LI_;
7745   TTI = &TTI_;
7746   DT = &DT_;
7747   BFI = &BFI_;
7748   TLI = TLI_;
7749   AA = &AA_;
7750   AC = &AC_;
7751   GetLAA = &GetLAA_;
7752   DB = &DB_;
7753   ORE = &ORE_;
7754   PSI = PSI_;
7755 
7756   // Don't attempt if
7757   // 1. the target claims to have no vector registers, and
7758   // 2. interleaving won't help ILP.
7759   //
7760   // The second condition is necessary because, even if the target has no
7761   // vector registers, loop vectorization may still enable scalar
7762   // interleaving.
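  // For example, on a target without vector registers but with multiple
  // scalar execution pipelines, interleaving two independent scalar
  // iterations of a reduction can still hide instruction latency.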
7763   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7764     return false;
7765 
7766   bool Changed = false;
7767 
7768   // The vectorizer requires loops to be in simplified form.
7769   // Since simplification may add new inner loops, it has to run before the
7770   // legality and profitability checks. This means running the loop vectorizer
7771   // will simplify all loops, regardless of whether anything ends up being
7772   // vectorized.
7773   for (auto &L : *LI)
7774     Changed |=
7775         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7776 
7777   // Build up a worklist of inner-loops to vectorize. This is necessary as
7778   // the act of vectorizing or partially unrolling a loop creates new loops
7779   // and can invalidate iterators across the loops.
7780   SmallVector<Loop *, 8> Worklist;
7781 
7782   for (Loop *L : *LI)
7783     collectSupportedLoops(*L, LI, ORE, Worklist);
7784 
7785   LoopsAnalyzed += Worklist.size();
7786 
7787   // Now walk the identified inner loops.
7788   while (!Worklist.empty()) {
7789     Loop *L = Worklist.pop_back_val();
7790 
7791     // For the inner loops we actually process, form LCSSA to simplify the
7792     // transform.
7793     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7794 
7795     Changed |= processLoop(L);
7796   }
7797 
7798   // Process each loop nest in the function.
7799   return Changed;
7800 }
7801 
7802 PreservedAnalyses LoopVectorizePass::run(Function &F,
7803                                          FunctionAnalysisManager &AM) {
7804   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7805   auto &LI = AM.getResult<LoopAnalysis>(F);
7806   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7807   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7808   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7809   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7810   auto &AA = AM.getResult<AAManager>(F);
7811   auto &AC = AM.getResult<AssumptionAnalysis>(F);
7812   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7813   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7814   MemorySSA *MSSA = EnableMSSALoopDependency
7815                         ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7816                         : nullptr;
7817 
7818   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7819   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7820       [&](Loop &L) -> const LoopAccessInfo & {
7821     LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7822     return LAM.getResult<LoopAccessAnalysis>(L, AR);
7823   };
7824   const ModuleAnalysisManager &MAM =
7825       AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7826   ProfileSummaryInfo *PSI =
7827       MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7828   bool Changed =
7829       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7830   if (!Changed)
7831     return PreservedAnalyses::all();
7832   PreservedAnalyses PA;
7833 
7834   // We currently do not preserve loopinfo/dominator analyses with outer loop
7835   // vectorization. Until this is addressed, mark these analyses as preserved
7836   // only for non-VPlan-native path.
7837   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7838   if (!EnableVPlanNativePath) {
7839     PA.preserve<LoopAnalysis>();
7840     PA.preserve<DominatorTreeAnalysis>();
7841   }
7842   PA.preserve<BasicAA>();
7843   PA.preserve<GlobalsAA>();
7844   return PA;
7845 }
7846