1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
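//
// For example, with a vectorization factor (VF) of 4, a loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + 1;
//
// is conceptually rewritten so that each wide iteration loads b[i..i+3],
// adds 1 to all four elements with a single vector add, stores the results
// to a[i..i+3], and then advances i by 4.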
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is a development effort going on to migrate the loop vectorizer to
// the VPlan infrastructure and to introduce outer loop vectorization support
// (see docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; it lists the available
// options. That is, the vectorizer will try to fold the tail loop (epilogue)
// into the vector body and predicate the instructions accordingly. If
// tail-folding fails, the fallback strategy depends on these values:
186 namespace PreferPredicateTy {
187   enum Option {
188     ScalarEpilogue = 0,
189     PredicateElseScalarEpilogue,
190     PredicateOrDontVectorize
191   };
192 } // namespace PreferPredicateTy
193 
194 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
195     "prefer-predicate-over-epilogue",
196     cl::init(PreferPredicateTy::ScalarEpilogue),
197     cl::Hidden,
198     cl::desc("Tail-folding and predication preferences over creating a scalar "
199              "epilogue loop."),
200     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
201                          "scalar-epilogue",
202                          "Don't tail-predicate loops, create scalar epilogue"),
203               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
204                          "predicate-else-scalar-epilogue",
205                          "prefer tail-folding, create scalar epilogue if tail "
206                          "folding fails."),
207               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
208                          "predicate-dont-vectorize",
                         "prefer tail-folding, don't attempt vectorization if "
210                          "tail-folding fails.")));
211 
212 static cl::opt<bool> MaximizeBandwidth(
213     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
214     cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));
216 
217 static cl::opt<bool> EnableInterleavedMemAccesses(
218     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
219     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
220 
221 /// An interleave-group may need masking if it resides in a block that needs
222 /// predication, or in order to mask away gaps.
223 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
224     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
225     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
226 
227 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
228     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
230              "below this number"));
231 
232 static cl::opt<unsigned> ForceTargetNumScalarRegs(
233     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
234     cl::desc("A flag that overrides the target's number of scalar registers."));
235 
236 static cl::opt<unsigned> ForceTargetNumVectorRegs(
237     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
238     cl::desc("A flag that overrides the target's number of vector registers."));
239 
240 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
241     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
242     cl::desc("A flag that overrides the target's max interleave factor for "
243              "scalar loops."));
244 
245 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
246     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
247     cl::desc("A flag that overrides the target's max interleave factor for "
248              "vectorized loops."));
249 
250 static cl::opt<unsigned> ForceTargetInstructionCost(
251     "force-target-instruction-cost", cl::init(0), cl::Hidden,
252     cl::desc("A flag that overrides the target's expected cost for "
253              "an instruction to a single constant value. Mostly "
254              "useful for getting consistent testing."));
255 
256 static cl::opt<unsigned> SmallLoopCost(
257     "small-loop-cost", cl::init(20), cl::Hidden,
258     cl::desc(
259         "The cost of a loop that is considered 'small' by the interleaver."));
260 
261 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
262     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
263     cl::desc("Enable the use of the block frequency analysis to access PGO "
264              "heuristics minimizing code growth in cold regions and being more "
265              "aggressive in hot regions."));
266 
267 // Runtime interleave loops for load/store throughput.
268 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
269     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
270     cl::desc(
271         "Enable runtime interleaving until load/store ports are saturated"));
272 
273 /// Interleave small loops with scalar reductions.
274 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
275     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
276     cl::desc("Enable interleaving for loops with small iteration counts that "
277              "contain scalar reductions to expose ILP."));
278 
279 /// The number of stores in a loop that are allowed to need predication.
280 static cl::opt<unsigned> NumberOfStoresToPredicate(
281     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
282     cl::desc("Max number of stores to be predicated behind an if."));
283 
284 static cl::opt<bool> EnableIndVarRegisterHeur(
285     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
286     cl::desc("Count the induction variable only once when interleaving"));
287 
288 static cl::opt<bool> EnableCondStoresVectorization(
289     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));
291 
292 static cl::opt<unsigned> MaxNestedScalarReductionIC(
293     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
294     cl::desc("The maximum interleave count to use when interleaving a scalar "
295              "reduction in a nested loop."));
296 
297 static cl::opt<bool>
298     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
299                            cl::Hidden,
300                            cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
302 
303 static cl::opt<bool> PreferPredicatedReductionSelect(
304     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
305     cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));
307 
308 cl::opt<bool> EnableVPlanNativePath(
309     "enable-vplan-native-path", cl::init(false), cl::Hidden,
310     cl::desc("Enable VPlan-native vectorization path with "
311              "support for outer loop vectorization."));
312 
313 // FIXME: Remove this switch once we have divergence analysis. Currently we
314 // assume divergent non-backedge branches when this switch is true.
315 cl::opt<bool> EnableVPlanPredication(
316     "enable-vplan-predication", cl::init(false), cl::Hidden,
317     cl::desc("Enable VPlan-native vectorization path predicator with "
318              "support for outer loop vectorization."));
319 
320 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
322 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
323 // verification of the H-CFGs built.
324 static cl::opt<bool> VPlanBuildStressTest(
325     "vplan-build-stress-test", cl::init(false), cl::Hidden,
326     cl::desc(
327         "Build VPlan for every supported loop nest in the function and bail "
328         "out right after the build (stress test the VPlan H-CFG construction "
329         "in the VPlan-native vectorization path)."));
330 
331 cl::opt<bool> llvm::EnableLoopInterleaving(
332     "interleave-loops", cl::init(true), cl::Hidden,
333     cl::desc("Enable loop interleaving in Loop vectorization passes"));
334 cl::opt<bool> llvm::EnableLoopVectorization(
335     "vectorize-loops", cl::init(true), cl::Hidden,
336     cl::desc("Run the Loop vectorization passes"));
337 
/// A helper function that returns the type of a loaded or stored value.
339 static Type *getMemInstValueType(Value *I) {
340   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
341          "Expected Load or Store instruction");
342   if (auto *LI = dyn_cast<LoadInst>(I))
343     return LI->getType();
344   return cast<StoreInst>(I)->getValueOperand()->getType();
345 }
346 
347 /// A helper function that returns true if the given type is irregular. The
348 /// type is irregular if its allocated size doesn't equal the store size of an
349 /// element of the corresponding vector type at the given vectorization factor.
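/// For example, under typical data layouts i1 is irregular: each scalar i1
/// is allocated a full byte, so an array of VF i1 values occupies VF bytes,
/// whereas a <VF x i1> vector stores into only ceil(VF/8) bytes, making the
/// two layouts not bitcast compatible.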
350 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
351   assert(!VF.isScalable() && "scalable vectors not yet supported.");
352   // Determine if an array of VF elements of type Ty is "bitcast compatible"
353   // with a <VF x Ty> vector.
354   if (VF.isVector()) {
355     auto *VectorTy = VectorType::get(Ty, VF);
356     return TypeSize::get(VF.getKnownMinValue() *
357                              DL.getTypeAllocSize(Ty).getFixedValue(),
358                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
359   }
360 
361   // If the vectorization factor is one, we just check if an array of type Ty
362   // requires padding between elements.
363   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
364 }
365 
366 /// A helper function that returns the reciprocal of the block probability of
367 /// predicated blocks. If we return X, we are assuming the predicated block
368 /// will execute once for every X iterations of the loop header.
369 ///
370 /// TODO: We should use actual block probability here, if available. Currently,
371 ///       we always assume predicated blocks have a 50% chance of executing.
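/// For example, returning 2 means a predicated block is assumed to run on
/// half of the loop header's iterations, so callers can scale the block's
/// cost down accordingly.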
372 static unsigned getReciprocalPredBlockProb() { return 2; }
373 
374 /// A helper function that adds a 'fast' flag to floating-point operations.
375 static Value *addFastMathFlag(Value *V) {
376   if (isa<FPMathOperator>(V))
377     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
378   return V;
379 }
380 
381 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
382   if (isa<FPMathOperator>(V))
383     cast<Instruction>(V)->setFastMathFlags(FMF);
384   return V;
385 }
386 
387 /// A helper function that returns an integer or floating-point constant with
388 /// value C.
389 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
390   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
391                            : ConstantFP::get(Ty, C);
392 }
393 
394 /// Returns "best known" trip count for the specified loop \p L as defined by
395 /// the following procedure:
396 ///   1) Returns exact trip count if it is known.
397 ///   2) Returns expected trip count according to profile data if any.
398 ///   3) Returns upper bound estimate if it is known.
399 ///   4) Returns None if all of the above failed.
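/// For example, a loop with a compile-time constant trip count of 12 yields
/// 12 from step 1, while a loop with an unknown trip count but profile data
/// suggesting roughly 100 iterations yields that estimate from step 2.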
400 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
401   // Check if exact trip count is known.
402   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
403     return ExpectedTC;
404 
405   // Check if there is an expected trip count available from profile data.
406   if (LoopVectorizeWithBlockFrequency)
407     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
408       return EstimatedTC;
409 
410   // Check if upper bound estimate is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
412     return ExpectedTC;
413 
414   return None;
415 }
416 
417 namespace llvm {
418 
419 /// InnerLoopVectorizer vectorizes loops which contain only one basic
420 /// block to a specified vectorization factor (VF).
421 /// This class performs the widening of scalars into vectors, or multiple
422 /// scalars. This class also implements the following features:
423 /// * It inserts an epilogue loop for handling loops that don't have iteration
424 ///   counts that are known to be a multiple of the vectorization factor.
425 /// * It handles the code generation for reduction variables.
426 /// * Scalarization (implementation using scalars) of un-vectorizable
427 ///   instructions.
428 /// InnerLoopVectorizer does not perform any vectorization-legality
429 /// checks, and relies on the caller to check for the different legality
430 /// aspects. The InnerLoopVectorizer relies on the
431 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
433 class InnerLoopVectorizer {
434 public:
435   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
436                       LoopInfo *LI, DominatorTree *DT,
437                       const TargetLibraryInfo *TLI,
438                       const TargetTransformInfo *TTI, AssumptionCache *AC,
439                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
440                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
441                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
442                       ProfileSummaryInfo *PSI)
443       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
444         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
445         Builder(PSE.getSE()->getContext()),
446         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
447         BFI(BFI), PSI(PSI) {
448     // Query this against the original loop and save it here because the profile
449     // of the original loop header may change as the transformation happens.
450     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
451         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
452   }
453 
454   virtual ~InnerLoopVectorizer() = default;
455 
456   /// Create a new empty loop that will contain vectorized instructions later
457   /// on, while the old loop will be used as the scalar remainder. Control flow
458   /// is generated around the vectorized (and scalar epilogue) loops consisting
459   /// of various checks and bypasses. Return the pre-header block of the new
460   /// loop.
461   BasicBlock *createVectorizedLoopSkeleton();
462 
463   /// Widen a single instruction within the innermost loop.
464   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
465                         VPTransformState &State);
466 
467   /// Widen a single call instruction within the innermost loop.
468   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
469                             VPTransformState &State);
470 
471   /// Widen a single select instruction within the innermost loop.
472   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
473                               bool InvariantCond, VPTransformState &State);
474 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
476   void fixVectorizedLoop();
477 
478   // Return true if any runtime check is added.
479   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
480 
481   /// A type for vectorized values in the new loop. Each value from the
482   /// original loop, when vectorized, is represented by UF vector values in the
483   /// new unrolled loop, where UF is the unroll factor.
484   using VectorParts = SmallVector<Value *, 2>;
485 
486   /// Vectorize a single GetElementPtrInst based on information gathered and
487   /// decisions taken during planning.
488   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
489                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
490                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
491 
492   /// Vectorize a single PHINode in a block. This method handles the induction
493   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
494   /// arbitrary length vectors.
495   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
496 
497   /// A helper function to scalarize a single Instruction in the innermost loop.
498   /// Generates a sequence of scalar instances for each lane between \p MinLane
499   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
500   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
501   /// Instr's operands.
502   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
503                             const VPIteration &Instance, bool IfPredicateInstr,
504                             VPTransformState &State);
505 
506   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
507   /// is provided, the integer induction variable will first be truncated to
508   /// the corresponding type.
509   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
510 
511   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
512   /// vector or scalar value on-demand if one is not yet available. When
513   /// vectorizing a loop, we visit the definition of an instruction before its
514   /// uses. When visiting the definition, we either vectorize or scalarize the
515   /// instruction, creating an entry for it in the corresponding map. (In some
516   /// cases, such as induction variables, we will create both vector and scalar
517   /// entries.) Then, as we encounter uses of the definition, we derive values
518   /// for each scalar or vector use unless such a value is already available.
519   /// For example, if we scalarize a definition and one of its uses is vector,
520   /// we build the required vector on-demand with an insertelement sequence
521   /// when visiting the use. Otherwise, if the use is scalar, we can use the
522   /// existing scalar definition.
523   ///
524   /// Return a value in the new loop corresponding to \p V from the original
525   /// loop at unroll index \p Part. If the value has already been vectorized,
526   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
527   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
528   /// a new vector value on-demand by inserting the scalar values into a vector
529   /// with an insertelement sequence. If the value has been neither vectorized
530   /// nor scalarized, it must be loop invariant, so we simply broadcast the
531   /// value into a vector.
532   Value *getOrCreateVectorValue(Value *V, unsigned Part);
533 
534   void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
535     VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
536   }
537 
538   /// Return a value in the new loop corresponding to \p V from the original
539   /// loop at unroll and vector indices \p Instance. If the value has been
540   /// vectorized but not scalarized, the necessary extractelement instruction
541   /// will be generated.
542   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
543 
544   /// Construct the vector value of a scalarized value \p V one lane at a time.
545   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
546 
547   /// Try to vectorize interleaved access group \p Group with the base address
548   /// given in \p Addr, optionally masking the vector operations if \p
549   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
550   /// values in the vectorized loop.
551   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
552                                 VPTransformState &State, VPValue *Addr,
553                                 ArrayRef<VPValue *> StoredValues,
554                                 VPValue *BlockInMask = nullptr);
555 
556   /// Vectorize Load and Store instructions with the base address given in \p
557   /// Addr, optionally masking the vector operations if \p BlockInMask is
558   /// non-null. Use \p State to translate given VPValues to IR values in the
559   /// vectorized loop.
560   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
561                                   VPValue *Def, VPValue *Addr,
562                                   VPValue *StoredValue, VPValue *BlockInMask);
563 
564   /// Set the debug location in the builder using the debug location in
565   /// the instruction.
566   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
567 
568   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
570 
571 protected:
572   friend class LoopVectorizationPlanner;
573 
574   /// A small list of PHINodes.
575   using PhiVector = SmallVector<PHINode *, 4>;
576 
577   /// A type for scalarized values in the new loop. Each value from the
578   /// original loop, when scalarized, is represented by UF x VF scalar values
579   /// in the new unrolled loop, where UF is the unroll factor and VF is the
580   /// vectorization factor.
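  /// For example, with UF = 2 and VF = 4, each scalarized value from the
  /// original loop is represented by 2 x 4 = 8 scalar values, indexed by
  /// unroll part and vector lane.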
581   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
582 
583   /// Set up the values of the IVs correctly when exiting the vector loop.
584   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
585                     Value *CountRoundDown, Value *EndValue,
586                     BasicBlock *MiddleBlock);
587 
588   /// Create a new induction variable inside L.
589   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
590                                    Value *Step, Instruction *DL);
591 
592   /// Handle all cross-iteration phis in the header.
593   void fixCrossIterationPHIs();
594 
595   /// Fix a first-order recurrence. This is the second phase of vectorizing
596   /// this phi node.
597   void fixFirstOrderRecurrence(PHINode *Phi);
598 
599   /// Fix a reduction cross-iteration phi. This is the second phase of
600   /// vectorizing this phi node.
601   void fixReduction(PHINode *Phi);
602 
603   /// Clear NSW/NUW flags from reduction instructions if necessary.
604   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
605 
606   /// The Loop exit block may have single value PHI nodes with some
607   /// incoming value. While vectorizing we only handled real values
608   /// that were defined inside the loop and we should have one value for
609   /// each predecessor of its parent basic block. See PR14725.
610   void fixLCSSAPHIs();
611 
612   /// Iteratively sink the scalarized operands of a predicated instruction into
613   /// the block that was created for it.
614   void sinkScalarOperands(Instruction *PredInst);
615 
616   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
617   /// represented as.
618   void truncateToMinimalBitwidths();
619 
620   /// Create a broadcast instruction. This method generates a broadcast
621   /// instruction (shuffle) for loop invariant values and for the induction
622   /// value. If this is the induction variable then we extend it to N, N+1, ...
623   /// this is needed because each iteration in the loop corresponds to a SIMD
624   /// element.
625   virtual Value *getBroadcastInstrs(Value *V);
626 
627   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
628   /// to each vector element of Val. The sequence starts at StartIndex.
629   /// \p Opcode is relevant for FP induction variable.
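  /// For example, for VF = 4 with StartIdx = 0 and Step = 1, the result is
  /// Val + <0, 1, 2, 3>.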
630   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
631                                Instruction::BinaryOps Opcode =
632                                Instruction::BinaryOpsEnd);
633 
634   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
635   /// variable on which to base the steps, \p Step is the size of the step, and
636   /// \p EntryVal is the value from the original loop that maps to the steps.
637   /// Note that \p EntryVal doesn't have to be an induction variable - it
638   /// can also be a truncate instruction.
639   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
640                         const InductionDescriptor &ID);
641 
642   /// Create a vector induction phi node based on an existing scalar one. \p
643   /// EntryVal is the value from the original loop that maps to the vector phi
644   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
645   /// truncate instruction, instead of widening the original IV, we widen a
646   /// version of the IV truncated to \p EntryVal's type.
647   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
648                                        Value *Step, Instruction *EntryVal);
649 
650   /// Returns true if an instruction \p I should be scalarized instead of
651   /// vectorized for the chosen vectorization factor.
652   bool shouldScalarizeInstruction(Instruction *I) const;
653 
654   /// Returns true if we should generate a scalar version of \p IV.
655   bool needsScalarInduction(Instruction *IV) const;
656 
657   /// If there is a cast involved in the induction variable \p ID, which should
658   /// be ignored in the vectorized loop body, this function records the
659   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
660   /// cast. We had already proved that the casted Phi is equal to the uncasted
661   /// Phi in the vectorized loop (under a runtime guard), and therefore
662   /// there is no need to vectorize the cast - the same value can be used in the
663   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
665   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
666   ///
667   /// \p EntryVal is the value from the original loop that maps to the vector
668   /// phi node and is used to distinguish what is the IV currently being
669   /// processed - original one (if \p EntryVal is a phi corresponding to the
670   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
672   /// latter case \p EntryVal is a TruncInst and we must not record anything for
673   /// that IV, but it's error-prone to expect callers of this routine to care
674   /// about that, hence this explicit parameter.
675   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
676                                              const Instruction *EntryVal,
677                                              Value *VectorLoopValue,
678                                              unsigned Part,
679                                              unsigned Lane = UINT_MAX);
680 
681   /// Generate a shuffle sequence that will reverse the vector Vec.
682   virtual Value *reverseVector(Value *Vec);
683 
684   /// Returns (and creates if needed) the original loop trip count.
685   Value *getOrCreateTripCount(Loop *NewLoop);
686 
687   /// Returns (and creates if needed) the trip count of the widened loop.
688   Value *getOrCreateVectorTripCount(Loop *NewLoop);
689 
690   /// Returns a bitcasted value to the requested vector type.
691   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
692   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
693                                 const DataLayout &DL);
694 
695   /// Emit a bypass check to see if the vector trip count is zero, including if
696   /// it overflows.
697   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
698 
699   /// Emit a bypass check to see if all of the SCEV assumptions we've
700   /// had to make are correct.
701   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
702 
703   /// Emit bypass checks to check any memory assumptions we may have made.
704   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
705 
706   /// Compute the transformed value of Index at offset StartValue using step
707   /// StepValue.
708   /// For integer induction, returns StartValue + Index * StepValue.
709   /// For pointer induction, returns StartValue[Index * StepValue].
710   /// FIXME: The newly created binary instructions should contain nsw/nuw
711   /// flags, which can be found from the original scalar operations.
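  /// For example, for an integer induction with StartValue 10 and StepValue 3,
  /// an Index of 4 is transformed to 10 + 4 * 3 = 22; for a pointer induction
  /// it addresses StartValue[4 * 3] = StartValue[12].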
712   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
713                               const DataLayout &DL,
714                               const InductionDescriptor &ID) const;
715 
716   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
717   /// vector loop preheader, middle block and scalar preheader. Also
718   /// allocate a loop object for the new vector loop and return it.
719   Loop *createVectorLoopSkeleton(StringRef Prefix);
720 
721   /// Create new phi nodes for the induction variables to resume iteration count
722   /// in the scalar epilogue, from where the vectorized loop left off (given by
723   /// \p VectorTripCount).
724   void createInductionResumeValues(Loop *L, Value *VectorTripCount);
725 
726   /// Complete the loop skeleton by adding debug MDs, creating appropriate
727   /// conditional branches in the middle block, preparing the builder and
728   /// running the verifier. Take in the vector loop \p L as argument, and return
729   /// the preheader of the completed vector loop.
730   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
731 
732   /// Add additional metadata to \p To that was not present on \p Orig.
733   ///
734   /// Currently this is used to add the noalias annotations based on the
735   /// inserted memchecks.  Use this for instructions that are *cloned* into the
736   /// vector loop.
737   void addNewMetadata(Instruction *To, const Instruction *Orig);
738 
739   /// Add metadata from one instruction to another.
740   ///
741   /// This includes both the original MDs from \p From and additional ones (\see
742   /// addNewMetadata).  Use this for *newly created* instructions in the vector
743   /// loop.
744   void addMetadata(Instruction *To, Instruction *From);
745 
746   /// Similar to the previous function but it adds the metadata to a
747   /// vector of instructions.
748   void addMetadata(ArrayRef<Value *> To, Instruction *From);
749 
750   /// The original loop.
751   Loop *OrigLoop;
752 
753   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
754   /// dynamic knowledge to simplify SCEV expressions and converts them to a
755   /// more usable form.
756   PredicatedScalarEvolution &PSE;
757 
758   /// Loop Info.
759   LoopInfo *LI;
760 
761   /// Dominator Tree.
762   DominatorTree *DT;
763 
764   /// Alias Analysis.
765   AAResults *AA;
766 
767   /// Target Library Info.
768   const TargetLibraryInfo *TLI;
769 
770   /// Target Transform Info.
771   const TargetTransformInfo *TTI;
772 
773   /// Assumption Cache.
774   AssumptionCache *AC;
775 
776   /// Interface to emit optimization remarks.
777   OptimizationRemarkEmitter *ORE;
778 
779   /// LoopVersioning.  It's only set up (non-null) if memchecks were
780   /// used.
781   ///
782   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
784   std::unique_ptr<LoopVersioning> LVer;
785 
786   /// The vectorization SIMD factor to use. Each vector will have this many
787   /// vector elements.
788   ElementCount VF;
789 
790   /// The vectorization unroll factor to use. Each scalar is vectorized to this
791   /// many different vector instructions.
792   unsigned UF;
793 
794   /// The builder that we use
795   IRBuilder<> Builder;
796 
797   // --- Vectorization state ---
798 
799   /// The vector-loop preheader.
800   BasicBlock *LoopVectorPreHeader;
801 
802   /// The scalar-loop preheader.
803   BasicBlock *LoopScalarPreHeader;
804 
805   /// Middle Block between the vector and the scalar.
806   BasicBlock *LoopMiddleBlock;
807 
808   /// The ExitBlock of the scalar loop.
809   BasicBlock *LoopExitBlock;
810 
811   /// The vector loop body.
812   BasicBlock *LoopVectorBody;
813 
814   /// The scalar loop body.
815   BasicBlock *LoopScalarBody;
816 
817   /// A list of all bypass blocks. The first block is the entry of the loop.
818   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
819 
820   /// The new Induction variable which was added to the new block.
821   PHINode *Induction = nullptr;
822 
823   /// The induction variable of the old basic block.
824   PHINode *OldInduction = nullptr;
825 
826   /// Maps values from the original loop to their corresponding values in the
827   /// vectorized loop. A key value can map to either vector values, scalar
828   /// values or both kinds of values, depending on whether the key was
829   /// vectorized and scalarized.
830   VectorizerValueMap VectorLoopValueMap;
831 
832   /// Store instructions that were predicated.
833   SmallVector<Instruction *, 4> PredicatedInstructions;
834 
835   /// Trip count of the original loop.
836   Value *TripCount = nullptr;
837 
838   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
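  /// For example, with an original trip count of 103, VF = 8 and UF = 2, the
  /// vector trip count is 103 - (103 % 16) = 96.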
839   Value *VectorTripCount = nullptr;
840 
841   /// The legality analysis.
842   LoopVectorizationLegality *Legal;
843 
  /// The profitability analysis.
845   LoopVectorizationCostModel *Cost;
846 
847   // Record whether runtime checks are added.
848   bool AddedSafetyChecks = false;
849 
850   // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
852   DenseMap<PHINode *, Value *> IVEndValues;
853 
854   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
855   // fixed up at the end of vector code generation.
856   SmallVector<PHINode *, 8> OrigPHIsToFix;
857 
858   /// BFI and PSI are used to check for profile guided size optimizations.
859   BlockFrequencyInfo *BFI;
860   ProfileSummaryInfo *PSI;
861 
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
864   bool OptForSizeBasedOnProfile;
865 };
866 
867 class InnerLoopUnroller : public InnerLoopVectorizer {
868 public:
869   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
870                     LoopInfo *LI, DominatorTree *DT,
871                     const TargetLibraryInfo *TLI,
872                     const TargetTransformInfo *TTI, AssumptionCache *AC,
873                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
874                     LoopVectorizationLegality *LVL,
875                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
876                     ProfileSummaryInfo *PSI)
877       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
878                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
879                             BFI, PSI) {}
880 
881 private:
882   Value *getBroadcastInstrs(Value *V) override;
883   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
884                        Instruction::BinaryOps Opcode =
885                        Instruction::BinaryOpsEnd) override;
886   Value *reverseVector(Value *Vec) override;
887 };
888 
889 } // end namespace llvm
890 
/// Look for a meaningful debug location on the instruction or its
892 /// operands.
893 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
894   if (!I)
895     return I;
896 
897   DebugLoc Empty;
898   if (I->getDebugLoc() != Empty)
899     return I;
900 
901   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
902     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
903       if (OpInst->getDebugLoc() != Empty)
904         return OpInst;
905   }
906 
907   return I;
908 }
909 
910 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
911   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
912     const DILocation *DIL = Inst->getDebugLoc();
913     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
914         !isa<DbgInfoIntrinsic>(Inst)) {
915       assert(!VF.isScalable() && "scalable vectors not yet supported.");
916       auto NewDIL =
917           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
918       if (NewDIL)
919         B.SetCurrentDebugLocation(NewDIL.getValue());
920       else
921         LLVM_DEBUG(dbgs()
922                    << "Failed to create new discriminator: "
923                    << DIL->getFilename() << " Line: " << DIL->getLine());
924     }
925     else
926       B.SetCurrentDebugLocation(DIL);
927   } else
928     B.SetCurrentDebugLocation(DebugLoc());
929 }
930 
931 /// Write a record \p DebugMsg about vectorization failure to the debug
932 /// output stream. If \p I is passed, it is an instruction that prevents
933 /// vectorization.
934 #ifndef NDEBUG
935 static void debugVectorizationFailure(const StringRef DebugMsg,
936     Instruction *I) {
937   dbgs() << "LV: Not vectorizing: " << DebugMsg;
938   if (I != nullptr)
939     dbgs() << " " << *I;
940   else
941     dbgs() << '.';
942   dbgs() << '\n';
943 }
944 #endif
945 
946 /// Create an analysis remark that explains why vectorization failed
947 ///
948 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
949 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
950 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
951 /// the location of the remark.  \return the remark object that can be
952 /// streamed to.
953 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
954     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
955   Value *CodeRegion = TheLoop->getHeader();
956   DebugLoc DL = TheLoop->getStartLoc();
957 
958   if (I) {
959     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
962     if (I->getDebugLoc())
963       DL = I->getDebugLoc();
964   }
965 
966   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
967   R << "loop not vectorized: ";
968   return R;
969 }
970 
971 namespace llvm {
972 
973 void reportVectorizationFailure(const StringRef DebugMsg,
974     const StringRef OREMsg, const StringRef ORETag,
975     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
976   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
977   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
978   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
979                 ORETag, TheLoop, I) << OREMsg);
980 }
981 
982 } // end namespace llvm
983 
984 #ifndef NDEBUG
985 /// \return string containing a file name and a line # for the given loop.
986 static std::string getDebugLocString(const Loop *L) {
987   std::string Result;
988   if (L) {
989     raw_string_ostream OS(Result);
990     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
991       LoopDbgLoc.print(OS);
992     else
993       // Just print the module name.
994       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
995     OS.flush();
996   }
997   return Result;
998 }
999 #endif
1000 
1001 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1002                                          const Instruction *Orig) {
1003   // If the loop was versioned with memchecks, add the corresponding no-alias
1004   // metadata.
1005   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1006     LVer->annotateInstWithNoAlias(To, Orig);
1007 }
1008 
1009 void InnerLoopVectorizer::addMetadata(Instruction *To,
1010                                       Instruction *From) {
1011   propagateMetadata(To, From);
1012   addNewMetadata(To, From);
1013 }
1014 
1015 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1016                                       Instruction *From) {
1017   for (Value *V : To) {
1018     if (Instruction *I = dyn_cast<Instruction>(V))
1019       addMetadata(I, From);
1020   }
1021 }
1022 
1023 namespace llvm {
1024 
1025 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1026 // lowered.
1027 enum ScalarEpilogueLowering {
1028 
1029   // The default: allowing scalar epilogues.
1030   CM_ScalarEpilogueAllowed,
1031 
1032   // Vectorization with OptForSize: don't allow epilogues.
1033   CM_ScalarEpilogueNotAllowedOptSize,
1034 
  // A special case of vectorization with OptForSize: loops with a very small
1036   // trip count are considered for vectorization under OptForSize, thereby
1037   // making sure the cost of their loop body is dominant, free of runtime
1038   // guards and scalar iteration overheads.
1039   CM_ScalarEpilogueNotAllowedLowTripLoop,
1040 
1041   // Loop hint predicate indicating an epilogue is undesired.
1042   CM_ScalarEpilogueNotNeededUsePredicate
1043 };
1044 
1045 /// LoopVectorizationCostModel - estimates the expected speedups due to
1046 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for
1048 /// a number of reasons. In this class we mainly attempt to predict the
1049 /// expected speedup/slowdowns due to the supported instruction set. We use the
1050 /// TargetTransformInfo to query the different backends for the cost of
1051 /// different operations.
1052 class LoopVectorizationCostModel {
1053 public:
1054   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1055                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1056                              LoopVectorizationLegality *Legal,
1057                              const TargetTransformInfo &TTI,
1058                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1059                              AssumptionCache *AC,
1060                              OptimizationRemarkEmitter *ORE, const Function *F,
1061                              const LoopVectorizeHints *Hints,
1062                              InterleavedAccessInfo &IAI)
1063       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1064         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1065         Hints(Hints), InterleaveInfo(IAI) {}
1066 
1067   /// \return An upper bound for the vectorization factor, or None if
1068   /// vectorization and interleaving should be avoided up front.
1069   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1070 
1071   /// \return True if runtime checks are required for vectorization, and false
1072   /// otherwise.
1073   bool runtimeChecksRequired();
1074 
1075   /// \return The most profitable vectorization factor and the cost of that VF.
1076   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1077   /// then this vectorization factor will be selected if vectorization is
1078   /// possible.
1079   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1080 
1081   /// Setup cost-based decisions for user vectorization factor.
1082   void selectUserVectorizationFactor(ElementCount UserVF) {
1083     collectUniformsAndScalars(UserVF);
1084     collectInstsToScalarize(UserVF);
1085   }
1086 
1087   /// \return The size (in bits) of the smallest and widest types in the code
1088   /// that needs to be vectorized. We ignore values that remain scalar such as
1089   /// 64 bit loop indices.
1090   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1091 
1092   /// \return The desired interleave count.
1093   /// If interleave count has been specified by metadata it will be returned.
1094   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1095   /// are the selected vectorization factor and the cost of the selected VF.
1096   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1097 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form it takes after vectorization depends on its cost.
  /// This function makes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1105   void setCostBasedWideningDecision(ElementCount VF);
1106 
1107   /// A struct that represents some properties of the register usage
1108   /// of a loop.
1109   struct RegisterUsage {
1110     /// Holds the number of loop invariant values that are used in the loop.
1111     /// The key is ClassID of target-provided register class.
1112     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1113     /// Holds the maximum number of concurrent live intervals in the loop.
1114     /// The key is ClassID of target-provided register class.
1115     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1116   };
1117 
1118   /// \return Returns information about the register usages of the loop for the
1119   /// given vectorization factors.
1120   SmallVector<RegisterUsage, 8>
1121   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1122 
1123   /// Collect values we want to ignore in the cost model.
1124   void collectValuesToIgnore();
1125 
1126   /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1128   void collectInLoopReductions();
1129 
1130   /// \returns The smallest bitwidth each instruction can be represented with.
1131   /// The vector equivalents of these instructions should be truncated to this
1132   /// type.
1133   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1134     return MinBWs;
1135   }
1136 
1137   /// \returns True if it is more profitable to scalarize instruction \p I for
1138   /// vectorization factor \p VF.
1139   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1140     assert(VF.isVector() &&
1141            "Profitable to scalarize relevant only for VF > 1.");
1142 
1143     // Cost model is not run in the VPlan-native path - return conservative
1144     // result until this changes.
1145     if (EnableVPlanNativePath)
1146       return false;
1147 
1148     auto Scalars = InstsToScalarize.find(VF);
1149     assert(Scalars != InstsToScalarize.end() &&
1150            "VF not yet analyzed for scalarization profitability");
1151     return Scalars->second.find(I) != Scalars->second.end();
1152   }
1153 
1154   /// Returns true if \p I is known to be uniform after vectorization.
1155   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1156     if (VF.isScalar())
1157       return true;
1158 
1159     // Cost model is not run in the VPlan-native path - return conservative
1160     // result until this changes.
1161     if (EnableVPlanNativePath)
1162       return false;
1163 
1164     auto UniformsPerVF = Uniforms.find(VF);
1165     assert(UniformsPerVF != Uniforms.end() &&
1166            "VF not yet analyzed for uniformity");
1167     return UniformsPerVF->second.count(I);
1168   }
1169 
1170   /// Returns true if \p I is known to be scalar after vectorization.
1171   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1172     if (VF.isScalar())
1173       return true;
1174 
1175     // Cost model is not run in the VPlan-native path - return conservative
1176     // result until this changes.
1177     if (EnableVPlanNativePath)
1178       return false;
1179 
1180     auto ScalarsPerVF = Scalars.find(VF);
1181     assert(ScalarsPerVF != Scalars.end() &&
1182            "Scalar values are not calculated for VF");
1183     return ScalarsPerVF->second.count(I);
1184   }
1185 
1186   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1187   /// for vectorization factor \p VF.
1188   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1189     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1190            !isProfitableToScalarize(I, VF) &&
1191            !isScalarAfterVectorization(I, VF);
1192   }
1193 
1194   /// Decision that was taken during cost calculation for memory instruction.
1195   enum InstWidening {
1196     CM_Unknown,
1197     CM_Widen,         // For consecutive accesses with stride +1.
1198     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1199     CM_Interleave,
1200     CM_GatherScatter,
1201     CM_Scalarize
1202   };
1203 
1204   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1205   /// instruction \p I and vector width \p VF.
1206   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1207                            unsigned Cost) {
1208     assert(VF.isVector() && "Expected VF >=2");
1209     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1210   }
1211 
1212   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1213   /// interleaving group \p Grp and vector width \p VF.
1214   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1215                            ElementCount VF, InstWidening W, unsigned Cost) {
1216     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // The cost will be assigned to one instruction only.
1219     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1220       if (auto *I = Grp->getMember(i)) {
1221         if (Grp->getInsertPos() == I)
1222           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1223         else
1224           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1225       }
1226     }
1227   }
1228 
1229   /// Return the cost model decision for the given instruction \p I and vector
1230   /// width \p VF. Return CM_Unknown if this instruction did not pass
1231   /// through the cost modeling.
1232   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1233     assert(!VF.isScalable() && "scalable vectors not yet supported.");
1234     assert(VF.isVector() && "Expected VF >=2");
1235 
1236     // Cost model is not run in the VPlan-native path - return conservative
1237     // result until this changes.
1238     if (EnableVPlanNativePath)
1239       return CM_GatherScatter;
1240 
1241     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1242     auto Itr = WideningDecisions.find(InstOnVF);
1243     if (Itr == WideningDecisions.end())
1244       return CM_Unknown;
1245     return Itr->second.first;
1246   }
1247 
1248   /// Return the vectorization cost for the given instruction \p I and vector
1249   /// width \p VF.
1250   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1251     assert(VF.isVector() && "Expected VF >=2");
1252     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1253     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1254            "The cost is not calculated");
1255     return WideningDecisions[InstOnVF].second;
1256   }
1257 
1258   /// Return True if instruction \p I is an optimizable truncate whose operand
1259   /// is an induction variable. Such a truncate will be removed by adding a new
1260   /// induction variable with the destination type.
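  /// For example (illustrative), for a loop with a 64-bit induction variable
  ///   for (i64 i = 0; i < n; ++i)
  ///     ... = (i32)i;   // trunc i64 %i to i32
  /// the truncate can be removed by introducing an additional i32 induction
  /// variable that is stepped in lockstep with the original one.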
1261   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1262     // If the instruction is not a truncate, return false.
1263     auto *Trunc = dyn_cast<TruncInst>(I);
1264     if (!Trunc)
1265       return false;
1266 
1267     // Get the source and destination types of the truncate.
1268     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1269     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1270 
1271     // If the truncate is free for the given types, return false. Replacing a
1272     // free truncate with an induction variable would add an induction variable
1273     // update instruction to each iteration of the loop. We exclude from this
1274     // check the primary induction variable since it will need an update
1275     // instruction regardless.
1276     Value *Op = Trunc->getOperand(0);
1277     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1278       return false;
1279 
1280     // If the truncated value is not an induction variable, return false.
1281     return Legal->isInductionPhi(Op);
1282   }
1283 
1284   /// Collects the instructions to scalarize for each predicated instruction in
1285   /// the loop.
1286   void collectInstsToScalarize(ElementCount VF);
1287 
1288   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decision for Load/Store instructions, which
  /// may be vectorized as an interleave group or gather/scatter, or scalarized.
1291   void collectUniformsAndScalars(ElementCount VF) {
1292     // Do the analysis once.
1293     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1294       return;
1295     setCostBasedWideningDecision(VF);
1296     collectLoopUniforms(VF);
1297     collectLoopScalars(VF);
1298   }
1299 
1300   /// Returns true if the target machine supports masked store operation
1301   /// for the given \p DataType and kind of access to \p Ptr.
1302   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1303     return Legal->isConsecutivePtr(Ptr) &&
1304            TTI.isLegalMaskedStore(DataType, Alignment);
1305   }
1306 
1307   /// Returns true if the target machine supports masked load operation
1308   /// for the given \p DataType and kind of access to \p Ptr.
1309   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1310     return Legal->isConsecutivePtr(Ptr) &&
1311            TTI.isLegalMaskedLoad(DataType, Alignment);
1312   }
1313 
1314   /// Returns true if the target machine supports masked scatter operation
1315   /// for the given \p DataType.
1316   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1317     return TTI.isLegalMaskedScatter(DataType, Alignment);
1318   }
1319 
1320   /// Returns true if the target machine supports masked gather operation
1321   /// for the given \p DataType.
1322   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1323     return TTI.isLegalMaskedGather(DataType, Alignment);
1324   }
1325 
1326   /// Returns true if the target machine can represent \p V as a masked gather
1327   /// or scatter operation.
1328   bool isLegalGatherOrScatter(Value *V) {
1329     bool LI = isa<LoadInst>(V);
1330     bool SI = isa<StoreInst>(V);
1331     if (!LI && !SI)
1332       return false;
1333     auto *Ty = getMemInstValueType(V);
1334     Align Align = getLoadStoreAlignment(V);
1335     return (LI && isLegalMaskedGather(Ty, Align)) ||
1336            (SI && isLegalMaskedScatter(Ty, Align));
1337   }
1338 
1339   /// Returns true if \p I is an instruction that will be scalarized with
1340   /// predication. Such instructions include conditional stores and
1341   /// instructions that may divide by zero.
1342   /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1344   bool isScalarWithPredication(Instruction *I,
1345                                ElementCount VF = ElementCount::getFixed(1));
1346 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or a masked load/store or gather/scatter.
  /// This is a superset of the instructions for which isScalarWithPredication
  /// returns true.
1350   bool isPredicatedInst(Instruction *I) {
1351     if (!blockNeedsPredication(I->getParent()))
1352       return false;
1353     // Loads and stores that need some form of masked operation are predicated
1354     // instructions.
1355     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1356       return Legal->isMaskRequired(I);
1357     return isScalarWithPredication(I);
1358   }
1359 
1360   /// Returns true if \p I is a memory instruction with consecutive memory
1361   /// access that can be widened.
1362   bool
1363   memoryInstructionCanBeWidened(Instruction *I,
1364                                 ElementCount VF = ElementCount::getFixed(1));
1365 
1366   /// Returns true if \p I is a memory instruction in an interleaved-group
1367   /// of memory accesses that can be vectorized with wide vector loads/stores
1368   /// and shuffles.
1369   bool
1370   interleavedAccessCanBeWidened(Instruction *I,
1371                                 ElementCount VF = ElementCount::getFixed(1));
1372 
1373   /// Check if \p Instr belongs to any interleaved access group.
1374   bool isAccessInterleaved(Instruction *Instr) {
1375     return InterleaveInfo.isInterleaved(Instr);
1376   }
1377 
1378   /// Get the interleaved access group that \p Instr belongs to.
1379   const InterleaveGroup<Instruction> *
1380   getInterleavedAccessGroup(Instruction *Instr) {
1381     return InterleaveInfo.getInterleaveGroup(Instr);
1382   }
1383 
1384   /// Returns true if an interleaved group requires a scalar iteration
1385   /// to handle accesses with gaps, and there is nothing preventing us from
1386   /// creating a scalar epilogue.
1387   bool requiresScalarEpilogue() const {
1388     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1389   }
1390 
1391   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1392   /// loop hint annotation.
1393   bool isScalarEpilogueAllowed() const {
1394     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1395   }
1396 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
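  /// For example (illustrative), with VF = 4 and a trip count of 10, the
  /// third vector iteration covers i = 8..11 and executes under the mask
  /// <1, 1, 0, 0>, so no scalar remainder loop is needed.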
1398   bool foldTailByMasking() const { return FoldTailByMasking; }
1399 
1400   bool blockNeedsPredication(BasicBlock *BB) {
1401     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1402   }
1403 
1404   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1405   /// nodes to the chain of instructions representing the reductions. Uses a
1406   /// MapVector to ensure deterministic iteration order.
1407   using ReductionChainMap =
1408       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1409 
  /// Return the chains of instructions representing the in-loop reductions.
1411   const ReductionChainMap &getInLoopReductionChains() const {
1412     return InLoopReductionChains;
1413   }
1414 
1415   /// Returns true if the Phi is part of an inloop reduction.
1416   bool isInLoopReduction(PHINode *Phi) const {
1417     return InLoopReductionChains.count(Phi);
1418   }
1419 
1420   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1421   /// with factor VF.  Return the cost of the instruction, including
1422   /// scalarization overhead if it's needed.
1423   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1424 
1425   /// Estimate cost of a call instruction CI if it were vectorized with factor
1426   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1430   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1431                              bool &NeedToScalarize);
1432 
1433   /// Invalidates decisions already taken by the cost model.
1434   void invalidateCostModelingDecisions() {
1435     WideningDecisions.clear();
1436     Uniforms.clear();
1437     Scalars.clear();
1438   }
1439 
1440 private:
1441   unsigned NumPredStores = 0;
1442 
1443   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1444   /// than zero. One is returned if vectorization should best be avoided due
1445   /// to cost.
1446   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount);
1447 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1455   using VectorizationCostTy = std::pair<unsigned, bool>;
1456 
1457   /// Returns the expected execution cost. The unit of the cost does
1458   /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
1461   VectorizationCostTy expectedCost(ElementCount VF);
1462 
1463   /// Returns the execution time cost of an instruction for a given vector
1464   /// width. Vector width of one means scalar.
1465   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1466 
1467   /// The cost-computation logic from getInstructionCost which provides
1468   /// the vector type as an output parameter.
1469   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1470 
1471   /// Calculate vectorization cost of memory instruction \p I.
1472   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1473 
1474   /// The cost computation for scalarized memory instruction.
1475   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1476 
1477   /// The cost computation for interleaving group of memory instructions.
1478   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1479 
1480   /// The cost computation for Gather/Scatter instruction.
1481   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1482 
1483   /// The cost computation for widening instruction \p I with consecutive
1484   /// memory access.
1485   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1486 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
1491   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1492 
1493   /// Estimate the overhead of scalarizing an instruction. This is a
1494   /// convenience wrapper for the type-based getScalarizationOverhead API.
1495   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1496 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1499   bool isConsecutiveLoadOrStore(Instruction *I);
1500 
1501   /// Returns true if an artificially high cost for emulated masked memrefs
1502   /// should be used.
1503   bool useEmulatedMaskMemRefHack(Instruction *I);
1504 
1505   /// Map of scalar integer values to the smallest bitwidth they can be legally
1506   /// represented as. The vector equivalents of these values should be truncated
1507   /// to this type.
1508   MapVector<Instruction *, uint64_t> MinBWs;
1509 
1510   /// A type representing the costs for instructions if they were to be
1511   /// scalarized rather than vectorized. The entries are Instruction-Cost
1512   /// pairs.
1513   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1514 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1517   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1518 
1519   /// Records whether it is allowed to have the original scalar loop execute at
1520   /// least once. This may be needed as a fallback loop in case runtime
1521   /// aliasing/dependence checks fail, or to handle the tail/remainder
1522   /// iterations when the trip count is unknown or doesn't divide by the VF,
1523   /// or as a peel-loop to handle gaps in interleave-groups.
1524   /// Under optsize and when the trip count is very small we don't allow any
1525   /// iterations to execute in the scalar loop.
1526   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1527 
  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
1529   bool FoldTailByMasking = false;
1530 
1531   /// A map holding scalar costs for different vectorization factors. The
1532   /// presence of a cost for an instruction in the mapping indicates that the
1533   /// instruction will be scalarized when vectorizing with the associated
1534   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1535   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1536 
1537   /// Holds the instructions known to be uniform after vectorization.
1538   /// The data is collected per VF.
1539   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1540 
1541   /// Holds the instructions known to be scalar after vectorization.
1542   /// The data is collected per VF.
1543   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1544 
1545   /// Holds the instructions (address computations) that are forced to be
1546   /// scalarized.
1547   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1548 
1549   /// PHINodes of the reductions that should be expanded in-loop along with
1550   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1552   ReductionChainMap InLoopReductionChains;
1553 
1554   /// Returns the expected difference in cost from scalarizing the expression
1555   /// feeding a predicated instruction \p PredInst. The instructions to
1556   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1557   /// non-negative return value implies the expression will be scalarized.
1558   /// Currently, only single-use chains are considered for scalarization.
1559   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1560                               ElementCount VF);
1561 
1562   /// Collect the instructions that are uniform after vectorization. An
1563   /// instruction is uniform if we represent it with a single scalar value in
1564   /// the vectorized loop corresponding to each vector iteration. Examples of
1565   /// uniform instructions include pointer operands of consecutive or
1566   /// interleaved memory accesses. Note that although uniformity implies an
1567   /// instruction will be scalar, the reverse is not true. In general, a
1568   /// scalarized instruction will be represented by VF scalar values in the
1569   /// vectorized loop, each corresponding to an iteration of the original
1570   /// scalar loop.
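  /// For example (illustrative), a consecutive load of a[i] with VF = 4 is
  /// widened roughly as
  ///   %ptr  = getelementptr i32, i32* %a, i64 %index
  ///   %cast = bitcast i32* %ptr to <4 x i32>*
  ///   %wide = load <4 x i32>, <4 x i32>* %cast
  /// so the address computation is uniform: one scalar pointer per vector
  /// iteration rather than VF separate pointers.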
1571   void collectLoopUniforms(ElementCount VF);
1572 
1573   /// Collect the instructions that are scalar after vectorization. An
1574   /// instruction is scalar if it is known to be uniform or will be scalarized
1575   /// during vectorization. Non-uniform scalarized instructions will be
1576   /// represented by VF values in the vectorized loop, each corresponding to an
1577   /// iteration of the original scalar loop.
1578   void collectLoopScalars(ElementCount VF);
1579 
1580   /// Keeps cost model vectorization decision and cost for instructions.
1581   /// Right now it is used for memory instructions only.
1582   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1583                                 std::pair<InstWidening, unsigned>>;
1584 
1585   DecisionList WideningDecisions;
1586 
1587   /// Returns true if \p V is expected to be vectorized and it needs to be
1588   /// extracted.
1589   bool needsExtract(Value *V, ElementCount VF) const {
1590     Instruction *I = dyn_cast<Instruction>(V);
1591     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1592         TheLoop->isLoopInvariant(I))
1593       return false;
1594 
1595     // Assume we can vectorize V (and hence we need extraction) if the
1596     // scalars are not computed yet. This can happen, because it is called
1597     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1598     // the scalars are collected. That should be a safe assumption in most
1599     // cases, because we check if the operands have vectorizable types
1600     // beforehand in LoopVectorizationLegality.
1601     return Scalars.find(VF) == Scalars.end() ||
1602            !isScalarAfterVectorization(I, VF);
1603   };
1604 
1605   /// Returns a range containing only operands needing to be extracted.
1606   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1607                                                    ElementCount VF) {
1608     return SmallVector<Value *, 4>(make_filter_range(
1609         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1610   }
1611 
1612 public:
1613   /// The loop that we evaluate.
1614   Loop *TheLoop;
1615 
1616   /// Predicated scalar evolution analysis.
1617   PredicatedScalarEvolution &PSE;
1618 
1619   /// Loop Info analysis.
1620   LoopInfo *LI;
1621 
1622   /// Vectorization legality.
1623   LoopVectorizationLegality *Legal;
1624 
1625   /// Vector target information.
1626   const TargetTransformInfo &TTI;
1627 
1628   /// Target Library Info.
1629   const TargetLibraryInfo *TLI;
1630 
1631   /// Demanded bits analysis.
1632   DemandedBits *DB;
1633 
1634   /// Assumption cache.
1635   AssumptionCache *AC;
1636 
1637   /// Interface to emit optimization remarks.
1638   OptimizationRemarkEmitter *ORE;
1639 
1640   const Function *TheFunction;
1641 
1642   /// Loop Vectorize Hint.
1643   const LoopVectorizeHints *Hints;
1644 
1645   /// The interleave access information contains groups of interleaved accesses
1646   /// with the same stride and close to each other.
1647   InterleavedAccessInfo &InterleaveInfo;
1648 
1649   /// Values to ignore in the cost model.
1650   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1651 
1652   /// Values to ignore in the cost model when VF > 1.
1653   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1654 };
1655 
1656 } // end namespace llvm
1657 
1658 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1659 // vectorization. The loop needs to be annotated with #pragma omp simd
1660 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1661 // vector length information is not provided, vectorization is not considered
1662 // explicit. Interleave hints are not allowed either. These limitations will be
1663 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1665 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1666 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1667 // provides *explicit vectorization hints* (LV can bypass legal checks and
1668 // assume that vectorization is legal). However, both hints are implemented
1669 // using the same metadata (llvm.loop.vectorize, processed by
1670 // LoopVectorizeHints). This will be fixed in the future when the native IR
1671 // representation for pragma 'omp simd' is introduced.
1672 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1673                                    OptimizationRemarkEmitter *ORE) {
1674   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1675   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1676 
1677   // Only outer loops with an explicit vectorization hint are supported.
1678   // Unannotated outer loops are ignored.
1679   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1680     return false;
1681 
1682   Function *Fn = OuterLp->getHeader()->getParent();
1683   if (!Hints.allowVectorization(Fn, OuterLp,
1684                                 true /*VectorizeOnlyWhenForced*/)) {
1685     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1686     return false;
1687   }
1688 
1689   if (Hints.getInterleave() > 1) {
1690     // TODO: Interleave support is future work.
1691     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1692                          "outer loops.\n");
1693     Hints.emitRemarkWithHints();
1694     return false;
1695   }
1696 
1697   return true;
1698 }
1699 
1700 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1701                                   OptimizationRemarkEmitter *ORE,
1702                                   SmallVectorImpl<Loop *> &V) {
1703   // Collect inner loops and outer loops without irreducible control flow. For
1704   // now, only collect outer loops that have explicit vectorization hints. If we
1705   // are stress testing the VPlan H-CFG construction, we collect the outermost
1706   // loop of every loop nest.
1707   if (L.isInnermost() || VPlanBuildStressTest ||
1708       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1709     LoopBlocksRPO RPOT(&L);
1710     RPOT.perform(LI);
1711     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1712       V.push_back(&L);
1713       // TODO: Collect inner loops inside marked outer loops in case
1714       // vectorization fails for the outer loop. Do not invoke
1715       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1716       // already known to be reducible. We can use an inherited attribute for
1717       // that.
1718       return;
1719     }
1720   }
1721   for (Loop *InnerL : L)
1722     collectSupportedLoops(*InnerL, LI, ORE, V);
1723 }
1724 
1725 namespace {
1726 
1727 /// The LoopVectorize Pass.
1728 struct LoopVectorize : public FunctionPass {
1729   /// Pass identification, replacement for typeid
1730   static char ID;
1731 
1732   LoopVectorizePass Impl;
1733 
1734   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1735                          bool VectorizeOnlyWhenForced = false)
1736       : FunctionPass(ID),
1737         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1738     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1739   }
1740 
1741   bool runOnFunction(Function &F) override {
1742     if (skipFunction(F))
1743       return false;
1744 
1745     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1746     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1747     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1748     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1749     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1750     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1751     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1752     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1753     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1754     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1755     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1756     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1757     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1758 
1759     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1760         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1761 
1762     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1763                         GetLAA, *ORE, PSI).MadeAnyChange;
1764   }
1765 
1766   void getAnalysisUsage(AnalysisUsage &AU) const override {
1767     AU.addRequired<AssumptionCacheTracker>();
1768     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1769     AU.addRequired<DominatorTreeWrapperPass>();
1770     AU.addRequired<LoopInfoWrapperPass>();
1771     AU.addRequired<ScalarEvolutionWrapperPass>();
1772     AU.addRequired<TargetTransformInfoWrapperPass>();
1773     AU.addRequired<AAResultsWrapperPass>();
1774     AU.addRequired<LoopAccessLegacyAnalysis>();
1775     AU.addRequired<DemandedBitsWrapperPass>();
1776     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1777     AU.addRequired<InjectTLIMappingsLegacy>();
1778 
1779     // We currently do not preserve loopinfo/dominator analyses with outer loop
1780     // vectorization. Until this is addressed, mark these analyses as preserved
1781     // only for non-VPlan-native path.
1782     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1783     if (!EnableVPlanNativePath) {
1784       AU.addPreserved<LoopInfoWrapperPass>();
1785       AU.addPreserved<DominatorTreeWrapperPass>();
1786     }
1787 
1788     AU.addPreserved<BasicAAWrapperPass>();
1789     AU.addPreserved<GlobalsAAWrapperPass>();
1790     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1791   }
1792 };
1793 
1794 } // end anonymous namespace
1795 
1796 //===----------------------------------------------------------------------===//
1797 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1798 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1799 //===----------------------------------------------------------------------===//
1800 
1801 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1802   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will stay
  // inside the vector loop body.
1805   Instruction *Instr = dyn_cast<Instruction>(V);
1806   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1807                      (!Instr ||
1808                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1809   // Place the code for broadcasting invariant variables in the new preheader.
1810   IRBuilder<>::InsertPointGuard Guard(Builder);
1811   if (SafeToHoist)
1812     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1813 
1814   // Broadcast the scalar into all locations in the vector.
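  // For a scalar %x of type i32 and VF = 4 this typically expands to
  // (illustrative):
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> undef,
  //                                    <4 x i32> zeroinitializer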
1815   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1816 
1817   return Shuf;
1818 }
1819 
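// For example (illustrative), for an integer induction with start 0 and step
// 1, VF = 4 and UF = 1, this creates in the vector loop:
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %preheader ],
//                                 [ %vec.ind.next, %latch ]
//   %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>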
1820 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1821     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1822   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1823          "Expected either an induction phi-node or a truncate of it!");
1824   Value *Start = II.getStartValue();
1825 
  // Construct the initial value of the vector IV in the vector loop preheader.
1827   auto CurrIP = Builder.saveIP();
1828   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1829   if (isa<TruncInst>(EntryVal)) {
1830     assert(Start->getType()->isIntegerTy() &&
1831            "Truncation requires an integer type");
1832     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1833     Step = Builder.CreateTrunc(Step, TruncType);
1834     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1835   }
1836   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1837   Value *SteppedStart =
1838       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1839 
1840   // We create vector phi nodes for both integer and floating-point induction
1841   // variables. Here, we determine the kind of arithmetic we will perform.
1842   Instruction::BinaryOps AddOp;
1843   Instruction::BinaryOps MulOp;
1844   if (Step->getType()->isIntegerTy()) {
1845     AddOp = Instruction::Add;
1846     MulOp = Instruction::Mul;
1847   } else {
1848     AddOp = II.getInductionOpcode();
1849     MulOp = Instruction::FMul;
1850   }
1851 
1852   // Multiply the vectorization factor by the step using integer or
1853   // floating-point arithmetic as appropriate.
1854   Value *ConstVF =
1855       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
1856   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1857 
1858   // Create a vector splat to use in the induction update.
1859   //
1860   // FIXME: If the step is non-constant, we create the vector splat with
1861   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1862   //        handle a constant vector splat.
1863   assert(!VF.isScalable() && "scalable vectors not yet supported.");
1864   Value *SplatVF = isa<Constant>(Mul)
1865                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1866                        : Builder.CreateVectorSplat(VF, Mul);
1867   Builder.restoreIP(CurrIP);
1868 
1869   // We may need to add the step a number of times, depending on the unroll
1870   // factor. The last of those goes into the PHI.
1871   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1872                                     &*LoopVectorBody->getFirstInsertionPt());
1873   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1874   Instruction *LastInduction = VecInd;
1875   for (unsigned Part = 0; Part < UF; ++Part) {
1876     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1877 
1878     if (isa<TruncInst>(EntryVal))
1879       addMetadata(LastInduction, EntryVal);
1880     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1881 
1882     LastInduction = cast<Instruction>(addFastMathFlag(
1883         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1884     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1885   }
1886 
1887   // Move the last step to the end of the latch block. This ensures consistent
1888   // placement of all induction updates.
1889   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1890   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1891   auto *ICmp = cast<Instruction>(Br->getCondition());
1892   LastInduction->moveBefore(ICmp);
1893   LastInduction->setName("vec.ind.next");
1894 
1895   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1896   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1897 }
1898 
1899 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1900   return Cost->isScalarAfterVectorization(I, VF) ||
1901          Cost->isProfitableToScalarize(I, VF);
1902 }
1903 
1904 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1905   if (shouldScalarizeInstruction(IV))
1906     return true;
1907   auto isScalarInst = [&](User *U) -> bool {
1908     auto *I = cast<Instruction>(U);
1909     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1910   };
1911   return llvm::any_of(IV->users(), isScalarInst);
1912 }
1913 
1914 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1915     const InductionDescriptor &ID, const Instruction *EntryVal,
1916     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1917   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1918          "Expected either an induction phi-node or a truncate of it!");
1919 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1926   if (isa<TruncInst>(EntryVal))
1927     return;
1928 
1929   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1930   if (Casts.empty())
1931     return;
1932   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1934   // induction update chain itself.
1935   Instruction *CastInst = *Casts.begin();
1936   if (Lane < UINT_MAX)
1937     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1938   else
1939     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1940 }
1941 
1942 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1943   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1944          "Primary induction variable must have an integer type");
1945 
1946   auto II = Legal->getInductionVars().find(IV);
1947   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1948 
1949   auto ID = II->second;
1950   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1951 
1952   // The value from the original loop to which we are mapping the new induction
1953   // variable.
1954   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1955 
1956   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1957 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1960   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1961     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1962            "Induction step should be loop invariant");
1963     if (PSE.getSE()->isSCEVable(IV->getType())) {
1964       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1965       return Exp.expandCodeFor(Step, Step->getType(),
1966                                LoopVectorPreHeader->getTerminator());
1967     }
1968     return cast<SCEVUnknown>(Step)->getValue();
1969   };
1970 
1971   // The scalar value to broadcast. This is derived from the canonical
1972   // induction variable. If a truncation type is given, truncate the canonical
1973   // induction variable and step. Otherwise, derive these values from the
1974   // induction descriptor.
1975   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1976     Value *ScalarIV = Induction;
1977     if (IV != OldInduction) {
1978       ScalarIV = IV->getType()->isIntegerTy()
1979                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1980                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1981                                           IV->getType());
1982       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1983       ScalarIV->setName("offset.idx");
1984     }
1985     if (Trunc) {
1986       auto *TruncType = cast<IntegerType>(Trunc->getType());
1987       assert(Step->getType()->isIntegerTy() &&
1988              "Truncation requires an integer step");
1989       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1990       Step = Builder.CreateTrunc(Step, TruncType);
1991     }
1992     return ScalarIV;
1993   };
1994 
  // Create the vector values from the scalar IV, for the case where we do not
  // create a vector IV.
1997   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1998     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1999     for (unsigned Part = 0; Part < UF; ++Part) {
2000       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2001       Value *EntryPart =
2002           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2003                         ID.getInductionOpcode());
2004       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2005       if (Trunc)
2006         addMetadata(EntryPart, Trunc);
2007       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2008     }
2009   };
2010 
2011   // Now do the actual transformations, and start with creating the step value.
2012   Value *Step = CreateStepValue(ID.getStep());
2013   if (VF.isZero() || VF.isScalar()) {
2014     Value *ScalarIV = CreateScalarIV(Step);
2015     CreateSplatIV(ScalarIV, Step);
2016     return;
2017   }
2018 
2019   // Determine if we want a scalar version of the induction variable. This is
2020   // true if the induction variable itself is not widened, or if it has at
2021   // least one user in the loop that is not widened.
2022   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2023   if (!NeedsScalarIV) {
2024     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2025     return;
2026   }
2027 
2028   // Try to create a new independent vector induction variable. If we can't
2029   // create the phi node, we will splat the scalar induction variable in each
2030   // loop iteration.
2031   if (!shouldScalarizeInstruction(EntryVal)) {
2032     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2033     Value *ScalarIV = CreateScalarIV(Step);
2034     // Create scalar steps that can be used by instructions we will later
2035     // scalarize. Note that the addition of the scalar steps will not increase
2036     // the number of instructions in the loop in the common case prior to
2037     // InstCombine. We will be trading one vector extract for each scalar step.
2038     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2039     return;
2040   }
2041 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we tail-fold, in which case the splat IV feeds
  // the predicate used by the masked loads/stores.
2045   Value *ScalarIV = CreateScalarIV(Step);
2046   if (!Cost->isScalarEpilogueAllowed())
2047     CreateSplatIV(ScalarIV, Step);
2048   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2049 }
2050 
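// For example (illustrative), for Val = <%i, %i, %i, %i>, StartIdx = 0 and
// Step = 1, this returns
//   <%i, %i, %i, %i> + <0, 1, 2, 3> * <1, 1, 1, 1> == <%i, %i+1, %i+2, %i+3>
// i.e. the per-lane induction values for one vector iteration.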
2051 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2052                                           Instruction::BinaryOps BinOp) {
2053   // Create and check the types.
2054   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2055   int VLen = ValVTy->getNumElements();
2056 
2057   Type *STy = Val->getType()->getScalarType();
2058   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2059          "Induction Step must be an integer or FP");
2060   assert(Step->getType() == STy && "Step has wrong type");
2061 
2062   SmallVector<Constant *, 8> Indices;
2063 
2064   if (STy->isIntegerTy()) {
2065     // Create a vector of consecutive numbers from zero to VF.
2066     for (int i = 0; i < VLen; ++i)
2067       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2068 
2069     // Add the consecutive indices to the vector value.
2070     Constant *Cv = ConstantVector::get(Indices);
2071     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2072     Step = Builder.CreateVectorSplat(VLen, Step);
2073     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be derived from the original scalar operations.
2076     Step = Builder.CreateMul(Cv, Step);
2077     return Builder.CreateAdd(Val, Step, "induction");
2078   }
2079 
2080   // Floating point induction.
2081   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2082          "Binary Opcode should be specified for FP induction");
2083   // Create a vector of consecutive numbers from zero to VF.
2084   for (int i = 0; i < VLen; ++i)
2085     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2086 
2087   // Add the consecutive indices to the vector value.
2088   Constant *Cv = ConstantVector::get(Indices);
2089 
2090   Step = Builder.CreateVectorSplat(VLen, Step);
2091 
2092   // Floating point operations had to be 'fast' to enable the induction.
2093   FastMathFlags Flags;
2094   Flags.setFast();
2095 
2096   Value *MulOp = Builder.CreateFMul(Cv, Step);
2097   if (isa<Instruction>(MulOp))
    // We have to check because MulOp may have been folded to a constant.
2099     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2100 
2101   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2102   if (isa<Instruction>(BOp))
2103     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2104   return BOp;
2105 }
2106 
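// For example (illustrative), with UF = 2, VF = 4, ScalarIV = %iv and Step = 1,
// this emits the eight scalar values %iv + 0 .. %iv + 7 and records them in
// VectorLoopValueMap under (Part, Lane) for use by scalarized instructions.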
2107 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2108                                            Instruction *EntryVal,
2109                                            const InductionDescriptor &ID) {
2110   // We shouldn't have to build scalar steps if we aren't vectorizing.
2111   assert(VF.isVector() && "VF should be greater than one");
2112   assert(!VF.isScalable() &&
2113          "the code below assumes a fixed number of elements at compile time");
2114   // Get the value type and ensure it and the step have the same integer type.
2115   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2116   assert(ScalarIVTy == Step->getType() &&
2117          "Val and Step should have the same type");
2118 
2119   // We build scalar steps for both integer and floating-point induction
2120   // variables. Here, we determine the kind of arithmetic we will perform.
2121   Instruction::BinaryOps AddOp;
2122   Instruction::BinaryOps MulOp;
2123   if (ScalarIVTy->isIntegerTy()) {
2124     AddOp = Instruction::Add;
2125     MulOp = Instruction::Mul;
2126   } else {
2127     AddOp = ID.getInductionOpcode();
2128     MulOp = Instruction::FMul;
2129   }
2130 
2131   // Determine the number of scalars we need to generate for each unroll
2132   // iteration. If EntryVal is uniform, we only need to generate the first
2133   // lane. Otherwise, we generate all VF values.
2134   unsigned Lanes =
2135       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2136           ? 1
2137           : VF.getKnownMinValue();
2138   // Compute the scalar steps and save the results in VectorLoopValueMap.
2139   for (unsigned Part = 0; Part < UF; ++Part) {
2140     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2141       auto *StartIdx = getSignedIntOrFpConstant(
2142           ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
2143       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2144       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2145       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2146       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2147     }
2148   }
2149 }
2150 
2151 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2152   assert(V != Induction && "The new induction variable should not be used.");
2153   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2154   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2155 
2156   // If we have a stride that is replaced by one, do it here. Defer this for
2157   // the VPlan-native path until we start running Legal checks in that path.
2158   if (!EnableVPlanNativePath && Legal->hasStride(V))
2159     V = ConstantInt::get(V->getType(), 1);
2160 
2161   // If we have a vector mapped to this value, return it.
2162   if (VectorLoopValueMap.hasVectorValue(V, Part))
2163     return VectorLoopValueMap.getVectorValue(V, Part);
2164 
2165   // If the value has not been vectorized, check if it has been scalarized
2166   // instead. If it has been scalarized, and we actually need the value in
2167   // vector form, we will construct the vector values on demand.
2168   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2169     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2170 
2171     // If we've scalarized a value, that value should be an instruction.
2172     auto *I = cast<Instruction>(V);
2173 
2174     // If we aren't vectorizing, we can just copy the scalar map values over to
2175     // the vector map.
2176     if (VF.isScalar()) {
2177       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2178       return ScalarValue;
2179     }
2180 
2181     // Get the last scalar instruction we generated for V and Part. If the value
2182     // is known to be uniform after vectorization, this corresponds to lane zero
2183     // of the Part unroll iteration. Otherwise, the last instruction is the one
2184     // we created for the last vector lane of the Part unroll iteration.
2185     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2186     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2187                             ? 0
2188                             : VF.getKnownMinValue() - 1;
2189     auto *LastInst = cast<Instruction>(
2190         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2191 
2192     // Set the insert point after the last scalarized instruction. This ensures
2193     // the insertelement sequence will directly follow the scalar definitions.
2194     auto OldIP = Builder.saveIP();
2195     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2196     Builder.SetInsertPoint(&*NewIP);
2197 
2198     // However, if we are vectorizing, we need to construct the vector values.
2199     // If the value is known to be uniform after vectorization, we can just
2200     // broadcast the scalar value corresponding to lane zero for each unroll
2201     // iteration. Otherwise, we construct the vector values using insertelement
2202     // instructions. Since the resulting vectors are stored in
2203     // VectorLoopValueMap, we will only generate the insertelements once.
2204     Value *VectorValue = nullptr;
2205     if (Cost->isUniformAfterVectorization(I, VF)) {
2206       VectorValue = getBroadcastInstrs(ScalarValue);
2207       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2208     } else {
2209       // Initialize packing with insertelements to start from undef.
2210       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2211       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2212       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2213       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2214         packScalarIntoVectorValue(V, {Part, Lane});
2215       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2216     }
2217     Builder.restoreIP(OldIP);
2218     return VectorValue;
2219   }
2220 
2221   // If this scalar is unknown, assume that it is a constant or that it is
2222   // loop invariant. Broadcast V and save the value for future uses.
2223   Value *B = getBroadcastInstrs(V);
2224   VectorLoopValueMap.setVectorValue(V, Part, B);
2225   return B;
2226 }
2227 
2228 Value *
2229 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2230                                             const VPIteration &Instance) {
2231   // If the value is not an instruction contained in the loop, it should
2232   // already be scalar.
2233   if (OrigLoop->isLoopInvariant(V))
2234     return V;
2235 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2239 
2240   // If the value from the original loop has not been vectorized, it is
2241   // represented by UF x VF scalar values in the new loop. Return the requested
2242   // scalar value.
2243   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2244     return VectorLoopValueMap.getScalarValue(V, Instance);
2245 
2246   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2247   // for the given unroll part. If this entry is not a vector type (i.e., the
2248   // vectorization factor is one), there is no need to generate an
2249   // extractelement instruction.
2250   auto *U = getOrCreateVectorValue(V, Instance.Part);
2251   if (!U->getType()->isVectorTy()) {
2252     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2253     return U;
2254   }
2255 
2256   // Otherwise, the value from the original loop has been vectorized and is
2257   // represented by UF vector values. Extract and return the requested scalar
2258   // value from the appropriate vector lane.
2259   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2260 }
2261 
2262 void InnerLoopVectorizer::packScalarIntoVectorValue(
2263     Value *V, const VPIteration &Instance) {
2264   assert(V != Induction && "The new induction variable should not be used.");
2265   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2266   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2267 
2268   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2269   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2270   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2271                                             Builder.getInt32(Instance.Lane));
2272   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2273 }
2274 
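// For example (illustrative), for VF = 4 this emits a shufflevector with mask
// <3, 2, 1, 0>, i.e. <a, b, c, d> becomes <d, c, b, a>.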
2275 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2276   assert(Vec->getType()->isVectorTy() && "Invalid type");
2277   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2278   SmallVector<int, 8> ShuffleMask;
2279   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2280     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2281 
2282   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2283 }
2284 
2285 // Return whether we allow using masked interleave-groups (for dealing with
2286 // strided loads/stores that reside in predicated blocks, or for dealing
2287 // with gaps).
2288 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2289   // If an override option has been passed in for interleaved accesses, use it.
2290   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2291     return EnableMaskedInterleavedMemAccesses;
2292 
2293   return TTI.enableMaskedInterleavedAccessVectorization();
2294 }
2295 
2296 // Try to vectorize the interleave group that \p Instr belongs to.
2297 //
2298 // E.g. Translate following interleaved load group (factor = 3):
2299 //   for (i = 0; i < N; i+=3) {
2300 //     R = Pic[i];             // Member of index 0
2301 //     G = Pic[i+1];           // Member of index 1
2302 //     B = Pic[i+2];           // Member of index 2
2303 //     ... // do something to R, G, B
2304 //   }
2305 // To:
2306 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2307 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2308 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2309 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2310 //
2311 // Or translate following interleaved store group (factor = 3):
2312 //   for (i = 0; i < N; i+=3) {
2313 //     ... do something to R, G, B
2314 //     Pic[i]   = R;           // Member of index 0
2315 //     Pic[i+1] = G;           // Member of index 1
2316 //     Pic[i+2] = B;           // Member of index 2
2317 //   }
2318 // To:
2319 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2320 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2321 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2322 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2323 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2324 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2325     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2326     VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) {
2327   Instruction *Instr = Group->getInsertPos();
2328   const DataLayout &DL = Instr->getModule()->getDataLayout();
2329 
2330   // Prepare for the vector type of the interleaved load/store.
2331   Type *ScalarTy = getMemInstValueType(Instr);
2332   unsigned InterleaveFactor = Group->getFactor();
2333   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2334   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2335 
2336   // Prepare for the new pointers.
2337   SmallVector<Value *, 2> AddrParts;
2338   unsigned Index = Group->getIndex(Instr);
2339 
2340   // TODO: extend the masked interleaved-group support to reversed access.
2341   assert((!BlockInMask || !Group->isReverse()) &&
2342          "Reversed masked interleave-group not supported.");
2343 
2344   // If the group is reverse, adjust the index to refer to the last vector lane
2345   // instead of the first. We adjust the index from the first vector lane,
2346   // rather than directly getting the pointer for lane VF - 1, because the
2347   // pointer operand of the interleaved access is supposed to be uniform. For
2348   // uniform instructions, we're only required to generate a value for the
2349   // first vector lane in each unroll iteration.
2350   assert(!VF.isScalable() &&
2351          "scalable vector reverse operation is not implemented");
2352   if (Group->isReverse())
2353     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2354 
2355   for (unsigned Part = 0; Part < UF; Part++) {
2356     Value *AddrPart = State.get(Addr, {Part, 0});
2357     setDebugLocFromInst(Builder, AddrPart);
2358 
    // Note that the current instruction could have any index in the group.
    // We need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2370 
2371     bool InBounds = false;
2372     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2373       InBounds = gep->isInBounds();
2374     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2375     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2376 
2377     // Cast to the vector pointer type.
2378     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2379     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2380     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2381   }
2382 
2383   setDebugLocFromInst(Builder, Instr);
2384   Value *UndefVec = UndefValue::get(VecTy);
2385 
2386   Value *MaskForGaps = nullptr;
2387   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2388     assert(!VF.isScalable() && "scalable vectors not yet supported.");
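    // E.g., for a factor-2 group whose second member is missing and VF = 4,
    // the gap mask is <1, 0, 1, 0, 1, 0, 1, 0>.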
2389     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2390     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2391   }
2392 
2393   // Vectorize the interleaved load group.
2394   if (isa<LoadInst>(Instr)) {
2395     // For each unroll part, create a wide load for the group.
2396     SmallVector<Value *, 2> NewLoads;
2397     for (unsigned Part = 0; Part < UF; Part++) {
2398       Instruction *NewLoad;
2399       if (BlockInMask || MaskForGaps) {
2400         assert(useMaskedInterleavedAccesses(*TTI) &&
2401                "masked interleaved groups are not allowed.");
2402         Value *GroupMask = MaskForGaps;
2403         if (BlockInMask) {
2404           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2405           assert(!VF.isScalable() && "scalable vectors not yet supported.");
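          // Replicate the block mask across the members of each tuple. E.g.,
          // for an interleave factor of 3 and VF = 4 the replicated mask
          // indices are <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>.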
2406           Value *ShuffledMask = Builder.CreateShuffleVector(
2407               BlockInMaskPart,
2408               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2409               "interleaved.mask");
2410           GroupMask = MaskForGaps
2411                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2412                                                 MaskForGaps)
2413                           : ShuffledMask;
2414         }
2415         NewLoad =
2416             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2417                                      GroupMask, UndefVec, "wide.masked.vec");
2418       }
2419       else
2420         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2421                                             Group->getAlign(), "wide.vec");
2422       Group->addMetadata(NewLoad);
2423       NewLoads.push_back(NewLoad);
2424     }
2425 
2426     // For each member in the group, shuffle out the appropriate data from the
2427     // wide loads.
2428     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2429       Instruction *Member = Group->getMember(I);
2430 
2431       // Skip the gaps in the group.
2432       if (!Member)
2433         continue;
2434 
2435       assert(!VF.isScalable() && "scalable vectors not yet supported.");
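      // E.g., for member index 1, an interleave factor of 3 and VF = 4, the
      // strided mask is <1, 4, 7, 10>.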
2436       auto StrideMask =
2437           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2438       for (unsigned Part = 0; Part < UF; Part++) {
2439         Value *StridedVec = Builder.CreateShuffleVector(
2440             NewLoads[Part], StrideMask, "strided.vec");
2441 
        // If this member has a different type, cast the result to that type.
2443         if (Member->getType() != ScalarTy) {
2444           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2445           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2446           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2447         }
2448 
2449         if (Group->isReverse())
2450           StridedVec = reverseVector(StridedVec);
2451 
2452         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2453       }
2454     }
2455     return;
2456   }
2457 
  // The sub-vector type for the current instruction.
2459   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2460   auto *SubVT = VectorType::get(ScalarTy, VF);
2461 
2462   // Vectorize the interleaved store group.
2463   for (unsigned Part = 0; Part < UF; Part++) {
2464     // Collect the stored vector from each member.
2465     SmallVector<Value *, 4> StoredVecs;
2466     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
      assert(Group->getMember(i) &&
             "Failed to get a member from an interleaved store group");
2469 
2470       Value *StoredVec = State.get(StoredValues[i], Part);
2471 
2472       if (Group->isReverse())
2473         StoredVec = reverseVector(StoredVec);
2474 
      // If this member has a different type, cast it to the unified type.
      if (StoredVec->getType() != SubVT)
2478         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2479 
2480       StoredVecs.push_back(StoredVec);
2481     }
2482 
2483     // Concatenate all vectors into a wide vector.
2484     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2485 
2486     // Interleave the elements in the wide vector.
2487     assert(!VF.isScalable() && "scalable vectors not yet supported.");
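    // E.g., for VF = 4 and an interleave factor of 3, the interleave mask is
    // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, as in the store example above.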
2488     Value *IVec = Builder.CreateShuffleVector(
2489         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2490         "interleaved.vec");
2491 
2492     Instruction *NewStoreInstr;
2493     if (BlockInMask) {
2494       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2495       Value *ShuffledMask = Builder.CreateShuffleVector(
2496           BlockInMaskPart,
2497           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2498           "interleaved.mask");
2499       NewStoreInstr = Builder.CreateMaskedStore(
2500           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2501     }
2502     else
2503       NewStoreInstr =
2504           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2505 
2506     Group->addMetadata(NewStoreInstr);
2507   }
2508 }
2509 
2510 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2511     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2512     VPValue *StoredValue, VPValue *BlockInMask) {
2513   // Attempt to issue a wide load.
2514   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2515   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2516 
2517   assert((LI || SI) && "Invalid Load/Store instruction");
2518   assert((!SI || StoredValue) && "No stored value provided for widened store");
2519   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2520 
2521   LoopVectorizationCostModel::InstWidening Decision =
2522       Cost->getWideningDecision(Instr, VF);
2523   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2524           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2525           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2526          "CM decision is not to widen the memory instruction");
2527 
2528   Type *ScalarDataTy = getMemInstValueType(Instr);
2529 
2530   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2531   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2532   const Align Alignment = getLoadStoreAlignment(Instr);
2533 
2534   // Determine if the pointer operand of the access is either consecutive or
2535   // reverse consecutive.
2536   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2537   bool ConsecutiveStride =
2538       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2539   bool CreateGatherScatter =
2540       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2541 
2542   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2543   // gather/scatter. Otherwise Decision should have been to Scalarize.
2544   assert((ConsecutiveStride || CreateGatherScatter) &&
2545          "The instruction should be scalarized");
2546   (void)ConsecutiveStride;
2547 
2548   VectorParts BlockInMaskParts(UF);
2549   bool isMaskRequired = BlockInMask;
2550   if (isMaskRequired)
2551     for (unsigned Part = 0; Part < UF; ++Part)
2552       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2553 
2554   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2555     // Calculate the pointer for the specific unroll-part.
2556     GetElementPtrInst *PartPtr = nullptr;
2557 
2558     bool InBounds = false;
2559     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2560       InBounds = gep->isInBounds();
2561 
2562     if (Reverse) {
2563       // If the address is consecutive but reversed, then the
2564       // wide store needs to start at the last vector element.
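      // E.g., for VF = 4 and Part = 1 the base pointer is moved back by 4
      // elements and then by a further 3, so the wide access covers the 4
      // elements ending at this part's lane-0 address.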
2565       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2566           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2567       PartPtr->setIsInBounds(InBounds);
2568       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2569           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2570       PartPtr->setIsInBounds(InBounds);
2571       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2572         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2573     } else {
2574       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2575           ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
2576       PartPtr->setIsInBounds(InBounds);
2577     }
2578 
2579     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2580     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2581   };
2582 
2583   // Handle Stores:
2584   if (SI) {
2585     setDebugLocFromInst(Builder, SI);
2586 
2587     for (unsigned Part = 0; Part < UF; ++Part) {
2588       Instruction *NewSI = nullptr;
2589       Value *StoredVal = State.get(StoredValue, Part);
2590       if (CreateGatherScatter) {
2591         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2592         Value *VectorGep = State.get(Addr, Part);
2593         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2594                                             MaskPart);
2595       } else {
2596         if (Reverse) {
2597           // If we store to reverse consecutive memory locations, then we need
2598           // to reverse the order of elements in the stored value.
2599           StoredVal = reverseVector(StoredVal);
2600           // We don't want to update the value in the map as it might be used in
2601           // another expression. So don't call resetVectorValue(StoredVal).
2602         }
2603         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2604         if (isMaskRequired)
2605           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2606                                             BlockInMaskParts[Part]);
2607         else
2608           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2609       }
2610       addMetadata(NewSI, SI);
2611     }
2612     return;
2613   }
2614 
2615   // Handle loads.
2616   assert(LI && "Must have a load instruction");
2617   setDebugLocFromInst(Builder, LI);
2618   for (unsigned Part = 0; Part < UF; ++Part) {
2619     Value *NewLI;
2620     if (CreateGatherScatter) {
2621       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2622       Value *VectorGep = State.get(Addr, Part);
2623       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2624                                          nullptr, "wide.masked.gather");
2625       addMetadata(NewLI, LI);
2626     } else {
2627       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2628       if (isMaskRequired)
2629         NewLI = Builder.CreateMaskedLoad(
2630             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2631             "wide.masked.load");
2632       else
2633         NewLI =
2634             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2635 
2636       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2637       addMetadata(NewLI, LI);
2638       if (Reverse)
2639         NewLI = reverseVector(NewLI);
2640     }
2641 
2642     State.set(Def, Instr, NewLI, Part);
2643   }
2644 }
2645 
2646 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2647                                                const VPIteration &Instance,
2648                                                bool IfPredicateInstr,
2649                                                VPTransformState &State) {
2650   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2651 
2652   setDebugLocFromInst(Builder, Instr);
2653 
  // Does this instruction return a value?
2655   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2656 
2657   Instruction *Cloned = Instr->clone();
2658   if (!IsVoidRetTy)
2659     Cloned->setName(Instr->getName() + ".cloned");
2660 
2661   // Replace the operands of the cloned instructions with their scalar
2662   // equivalents in the new loop.
2663   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2664     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2665     auto InputInstance = Instance;
2666     if (!Operand || !OrigLoop->contains(Operand) ||
2667         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2668       InputInstance.Lane = 0;
2669     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2670     Cloned->setOperand(op, NewOp);
2671   }
2672   addNewMetadata(Cloned, Instr);
2673 
2674   // Place the cloned scalar in the new loop.
2675   Builder.Insert(Cloned);
2676 
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2678   // representing scalar values in VPTransformState. Add the cloned scalar to
2679   // the scalar map entry.
2680   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2681 
  // If we just cloned a new assumption, add it to the assumption cache.
2683   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2684     if (II->getIntrinsicID() == Intrinsic::assume)
2685       AC->registerAssumption(II);
2686 
2687   // End if-block.
2688   if (IfPredicateInstr)
2689     PredicatedInstructions.push_back(Cloned);
2690 }
2691 
2692 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2693                                                       Value *End, Value *Step,
2694                                                       Instruction *DL) {
2695   BasicBlock *Header = L->getHeader();
2696   BasicBlock *Latch = L->getLoopLatch();
2697   // As we're just creating this loop, it's possible no latch exists
2698   // yet. If so, use the header as this will be a single block loop.
2699   if (!Latch)
2700     Latch = Header;
2701 
2702   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2703   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2704   setDebugLocFromInst(Builder, OldInst);
2705   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2706 
2707   Builder.SetInsertPoint(Latch->getTerminator());
2708   setDebugLocFromInst(Builder, OldInst);
2709 
2710   // Create i+1 and fill the PHINode.
2711   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2712   Induction->addIncoming(Start, L->getLoopPreheader());
2713   Induction->addIncoming(Next, Latch);
2714   // Create the compare.
2715   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2716   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2717 
2718   // Now we have two terminators. Remove the old one from the block.
2719   Latch->getTerminator()->eraseFromParent();
2720 
2721   return Induction;
2722 }
2723 
2724 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2725   if (TripCount)
2726     return TripCount;
2727 
2728   assert(L && "Create Trip Count for null loop.");
2729   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2730   // Find the loop boundaries.
2731   ScalarEvolution *SE = PSE.getSE();
2732   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2733   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2734          "Invalid loop count");
2735 
2736   Type *IdxTy = Legal->getWidestInductionType();
2737   assert(IdxTy && "No type for induction");
2738 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign-extended before the compare.
  // The only way we obtain a backedge-taken count in such a case is if the
  // induction variable is signed and therefore will not overflow, so the
  // truncation is legal.
2744   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2745       IdxTy->getPrimitiveSizeInBits())
2746     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2747   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2748 
2749   // Get the total trip count from the count by adding 1.
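  // E.g., for a loop `for (i = 0; i < n; ++i)` that executes n > 0 times, the
  // backedge-taken count is n - 1 and the trip count is n.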
2750   const SCEV *ExitCount = SE->getAddExpr(
2751       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2752 
2753   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2754 
2755   // Expand the trip count and place the new instructions in the preheader.
2756   // Notice that the pre-header does not change, only the loop body.
2757   SCEVExpander Exp(*SE, DL, "induction");
2758 
2759   // Count holds the overall loop count (N).
2760   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2761                                 L->getLoopPreheader()->getTerminator());
2762 
2763   if (TripCount->getType()->isPointerTy())
2764     TripCount =
2765         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2766                                     L->getLoopPreheader()->getTerminator());
2767 
2768   return TripCount;
2769 }
2770 
2771 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2772   if (VectorTripCount)
2773     return VectorTripCount;
2774 
2775   Value *TC = getOrCreateTripCount(L);
2776   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2777 
2778   Type *Ty = TC->getType();
2779   // This is where we can make the step a runtime constant.
2780   assert(!VF.isScalable() && "scalable vectorization is not supported yet");
2781   Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
2782 
2783   // If the tail is to be folded by masking, round the number of iterations N
2784   // up to a multiple of Step instead of rounding down. This is done by first
2785   // adding Step-1 and then rounding down. Note that it's ok if this addition
2786   // overflows: the vector induction variable will eventually wrap to zero given
2787   // that it starts at zero and its Step is a power of two; the loop will then
2788   // exit, with the last early-exit vector comparison also producing all-true.
2789   if (Cost->foldTailByMasking()) {
2790     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2791            "VF*UF must be a power of 2 when folding tail by masking");
2792     TC = Builder.CreateAdd(
2793         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2794   }
2795 
2796   // Now we need to generate the expression for the part of the loop that the
2797   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2798   // iterations are not required for correctness, or N - Step, otherwise. Step
2799   // is equal to the vectorization factor (number of SIMD elements) times the
2800   // unroll factor (number of SIMD instructions).
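  // E.g., for VF = 4 and UF = 2, Step is 8; with N = 10 this yields
  // n.mod.vf = 2 and n.vec = 8, leaving two scalar iterations. With tail
  // folding, N was first rounded up to 17, so n.vec = 16 and the masked
  // vector loop covers all iterations.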
2801   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2802 
2803   // If there is a non-reversed interleaved group that may speculatively access
2804   // memory out-of-bounds, we need to ensure that there will be at least one
2805   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2806   // the trip count, we set the remainder to be equal to the step. If the step
2807   // does not evenly divide the trip count, no adjustment is necessary since
2808   // there will already be scalar iterations. Note that the minimum iterations
2809   // check ensures that N >= Step.
2810   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2811     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2812     R = Builder.CreateSelect(IsZero, Step, R);
2813   }
2814 
2815   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2816 
2817   return VectorTripCount;
2818 }
2819 
2820 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2821                                                    const DataLayout &DL) {
2822   // Verify that V is a vector type with same number of elements as DstVTy.
2823   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2824   unsigned VF = DstFVTy->getNumElements();
2825   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2827   Type *SrcElemTy = SrcVecTy->getElementType();
2828   Type *DstElemTy = DstFVTy->getElementType();
2829   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2830          "Vector elements must have same size");
2831 
2832   // Do a direct cast if element types are castable.
2833   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2834     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2835   }
  // V cannot be directly cast to the desired vector type.
2837   // May happen when V is a floating point vector but DstVTy is a vector of
2838   // pointers or vice-versa. Handle this using a two-step bitcast using an
2839   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
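  // E.g., on a target with 64-bit pointers, a <2 x double> <-> <2 x i8*> cast
  // is lowered as <2 x double> <-> <2 x i64> <-> <2 x i8*>.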
2840   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2841          "Only one type should be a pointer type");
2842   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2843          "Only one type should be a floating point type");
2844   Type *IntTy =
2845       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2846   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2847   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2848   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2849 }
2850 
2851 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2852                                                          BasicBlock *Bypass) {
2853   Value *Count = getOrCreateTripCount(L);
2854   // Reuse existing vector loop preheader for TC checks.
2855   // Note that new preheader block is generated for vector loop.
2856   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2857   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2858 
2859   // Generate code to check if the loop's trip count is less than VF * UF, or
2860   // equal to it in case a scalar epilogue is required; this implies that the
2861   // vector trip count is zero. This check also covers the case where adding one
2862   // to the backedge-taken count overflowed leading to an incorrect trip count
2863   // of zero. In this case we will also jump to the scalar loop.
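  // E.g., for VF = 4 and UF = 2 with a required scalar epilogue, a trip count
  // of exactly 8 must still branch to the scalar loop, hence ULE rather than
  // ULT.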
2864   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2865                                           : ICmpInst::ICMP_ULT;
2866 
2867   // If tail is to be folded, vector loop takes care of all iterations.
2868   Value *CheckMinIters = Builder.getFalse();
2869   if (!Cost->foldTailByMasking()) {
2870     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2871     CheckMinIters = Builder.CreateICmp(
2872         P, Count,
2873         ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
2874         "min.iters.check");
2875   }
2876   // Create new preheader for vector loop.
2877   LoopVectorPreHeader =
2878       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2879                  "vector.ph");
2880 
2881   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2882                                DT->getNode(Bypass)->getIDom()) &&
2883          "TC check is expected to dominate Bypass");
2884 
2885   // Update dominator for Bypass & LoopExit.
2886   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2887   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2888 
2889   ReplaceInstWithInst(
2890       TCCheckBlock->getTerminator(),
2891       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2892   LoopBypassBlocks.push_back(TCCheckBlock);
2893 }
2894 
2895 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2896   // Reuse existing vector loop preheader for SCEV checks.
2897   // Note that new preheader block is generated for vector loop.
2898   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2899 
  // Generate the code to check the SCEV assumptions that we made.
2901   // We want the new basic block to start at the first instruction in a
2902   // sequence of instructions that form a check.
2903   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2904                    "scev.check");
2905   Value *SCEVCheck = Exp.expandCodeForPredicate(
2906       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2907 
2908   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2909     if (C->isZero())
2910       return;
2911 
2912   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2913            (OptForSizeBasedOnProfile &&
2914             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2915          "Cannot SCEV check stride or overflow when optimizing for size");
2916 
2917   SCEVCheckBlock->setName("vector.scevcheck");
2918   // Create new preheader for vector loop.
2919   LoopVectorPreHeader =
2920       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2921                  nullptr, "vector.ph");
2922 
2923   // Update dominator only if this is first RT check.
2924   if (LoopBypassBlocks.empty()) {
2925     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2926     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2927   }
2928 
2929   ReplaceInstWithInst(
2930       SCEVCheckBlock->getTerminator(),
2931       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2932   LoopBypassBlocks.push_back(SCEVCheckBlock);
2933   AddedSafetyChecks = true;
2934 }
2935 
2936 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2937   // VPlan-native path does not do any analysis for runtime checks currently.
2938   if (EnableVPlanNativePath)
2939     return;
2940 
2941   // Reuse existing vector loop preheader for runtime memory checks.
2942   // Note that new preheader block is generated for vector loop.
2943   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2944 
2945   // Generate the code that checks in runtime if arrays overlap. We put the
2946   // checks into a separate block to make the more common case of few elements
2947   // faster.
2948   auto *LAI = Legal->getLAI();
2949   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2950   if (!RtPtrChecking.Need)
2951     return;
2952 
2953   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2954     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2955            "Cannot emit memory checks when optimizing for size, unless forced "
2956            "to vectorize.");
2957     ORE->emit([&]() {
2958       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2959                                         L->getStartLoc(), L->getHeader())
2960              << "Code-size may be reduced by not forcing "
2961                 "vectorization, or by source-code modifications "
2962                 "eliminating the need for runtime checks "
2963                 "(e.g., adding 'restrict').";
2964     });
2965   }
2966 
2967   MemCheckBlock->setName("vector.memcheck");
2968   // Create new preheader for vector loop.
2969   LoopVectorPreHeader =
2970       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2971                  "vector.ph");
2972 
2973   auto *CondBranch = cast<BranchInst>(
2974       Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
  ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);

  // Update dominator only if this is the first RT check. Do this before
  // adding MemCheckBlock to LoopBypassBlocks, as the check relies on the list
  // still being empty at this point.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;
2984 
2985   Instruction *FirstCheckInst;
2986   Instruction *MemRuntimeCheck;
2987   std::tie(FirstCheckInst, MemRuntimeCheck) =
2988       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2989                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2990   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2991                             "claimed checks are required");
2992   CondBranch->setCondition(MemRuntimeCheck);
2993 
2994   // We currently don't use LoopVersioning for the actual loop cloning but we
2995   // still use it to add the noalias metadata.
2996   LVer = std::make_unique<LoopVersioning>(
2997       *Legal->getLAI(),
2998       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
2999       DT, PSE.getSE());
3000   LVer->prepareNoAliasMetadata();
3001 }
3002 
3003 Value *InnerLoopVectorizer::emitTransformedIndex(
3004     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3005     const InductionDescriptor &ID) const {
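  // This computes the value of Index transformed by the induction descriptor
  // ID: for an integer induction this is StartValue + Index * Step (e.g.,
  // start 5, step 3 and Index 4 yield 5 + 4 * 3 = 17); pointer and
  // floating-point inductions are handled analogously below.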
3006 
3007   SCEVExpander Exp(*SE, DL, "induction");
3008   auto Step = ID.getStep();
3009   auto StartValue = ID.getStartValue();
3010   assert(Index->getType() == Step->getType() &&
3011          "Index type does not match StepValue type");
3012 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us a
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
3019   auto CreateAdd = [&B](Value *X, Value *Y) {
3020     assert(X->getType() == Y->getType() && "Types don't match!");
3021     if (auto *CX = dyn_cast<ConstantInt>(X))
3022       if (CX->isZero())
3023         return Y;
3024     if (auto *CY = dyn_cast<ConstantInt>(Y))
3025       if (CY->isZero())
3026         return X;
3027     return B.CreateAdd(X, Y);
3028   };
3029 
3030   auto CreateMul = [&B](Value *X, Value *Y) {
3031     assert(X->getType() == Y->getType() && "Types don't match!");
3032     if (auto *CX = dyn_cast<ConstantInt>(X))
3033       if (CX->isOne())
3034         return Y;
3035     if (auto *CY = dyn_cast<ConstantInt>(Y))
3036       if (CY->isOne())
3037         return X;
3038     return B.CreateMul(X, Y);
3039   };
3040 
3041   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3042   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3043   // the DomTree is not kept up-to-date for additional blocks generated in the
3044   // vector loop. By using the header as insertion point, we guarantee that the
3045   // expanded instructions dominate all their uses.
3046   auto GetInsertPoint = [this, &B]() {
3047     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3048     if (InsertBB != LoopVectorBody &&
3049         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3050       return LoopVectorBody->getTerminator();
3051     return &*B.GetInsertPoint();
3052   };
3053   switch (ID.getKind()) {
3054   case InductionDescriptor::IK_IntInduction: {
3055     assert(Index->getType() == StartValue->getType() &&
3056            "Index type does not match StartValue type");
3057     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3058       return B.CreateSub(StartValue, Index);
3059     auto *Offset = CreateMul(
3060         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3061     return CreateAdd(StartValue, Offset);
3062   }
3063   case InductionDescriptor::IK_PtrInduction: {
3064     assert(isa<SCEVConstant>(Step) &&
3065            "Expected constant step for pointer induction");
3066     return B.CreateGEP(
3067         StartValue->getType()->getPointerElementType(), StartValue,
3068         CreateMul(Index,
3069                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3070   }
3071   case InductionDescriptor::IK_FpInduction: {
3072     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3073     auto InductionBinOp = ID.getInductionBinOp();
3074     assert(InductionBinOp &&
3075            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3076             InductionBinOp->getOpcode() == Instruction::FSub) &&
3077            "Original bin op should be defined for FP induction");
3078 
3079     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3080 
3081     // Floating point operations had to be 'fast' to enable the induction.
3082     FastMathFlags Flags;
3083     Flags.setFast();
3084 
3085     Value *MulExp = B.CreateFMul(StepValue, Index);
3086     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may have been folded to a constant.
3088       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3089 
3090     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3091                                "induction");
3092     if (isa<Instruction>(BOp))
3093       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3094 
3095     return BOp;
3096   }
3097   case InductionDescriptor::IK_NoInduction:
3098     return nullptr;
3099   }
3100   llvm_unreachable("invalid enum");
3101 }
3102 
3103 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3104   LoopScalarBody = OrigLoop->getHeader();
3105   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3106   LoopExitBlock = OrigLoop->getExitBlock();
3107   assert(LoopExitBlock && "Must have an exit block");
3108   assert(LoopVectorPreHeader && "Invalid loop structure");
3109 
3110   LoopMiddleBlock =
3111       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3112                  LI, nullptr, Twine(Prefix) + "middle.block");
3113   LoopScalarPreHeader =
3114       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3115                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo, since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3119   LoopVectorBody =
3120       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3121                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3122 
3123   // Update dominator for loop exit.
3124   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3125 
3126   // Create and register the new vector loop.
3127   Loop *Lp = LI->AllocateLoop();
3128   Loop *ParentLoop = OrigLoop->getParentLoop();
3129 
3130   // Insert the new loop into the loop nest and register the new basic blocks
3131   // before calling any utilities such as SCEV that require valid LoopInfo.
3132   if (ParentLoop) {
3133     ParentLoop->addChildLoop(Lp);
3134   } else {
3135     LI->addTopLevelLoop(Lp);
3136   }
3137   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3138   return Lp;
3139 }
3140 
3141 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3142                                                       Value *VectorTripCount) {
3143   assert(VectorTripCount && L && "Expected valid arguments");
3144   // We are going to resume the execution of the scalar loop.
3145   // Go over all of the induction variables that we found and fix the
3146   // PHIs that are left in the scalar version of the loop.
3147   // The starting values of PHI nodes depend on the counter of the last
3148   // iteration in the vectorized loop.
3149   // If we come from a bypass edge then we need to start from the original
3150   // start value.
3151   for (auto &InductionEntry : Legal->getInductionVars()) {
3152     PHINode *OrigPhi = InductionEntry.first;
3153     InductionDescriptor II = InductionEntry.second;
3154 
    // Create phi nodes to merge from the backedge-taken check block.
3156     PHINode *BCResumeVal =
3157         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3158                         LoopScalarPreHeader->getTerminator());
3159     // Copy original phi DL over to the new one.
3160     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3161     Value *&EndValue = IVEndValues[OrigPhi];
3162     if (OrigPhi == OldInduction) {
3163       // We know what the end value is.
3164       EndValue = VectorTripCount;
3165     } else {
3166       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3167       Type *StepType = II.getStep()->getType();
3168       Instruction::CastOps CastOp =
3169           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3170       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3171       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3172       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3173       EndValue->setName("ind.end");
3174     }
3175 
3176     // The new PHI merges the original incoming value, in case of a bypass,
3177     // or the value at the end of the vectorized loop.
3178     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3179 
3180     // Fix the scalar body counter (PHI node).
3181     // The old induction's phi node in the scalar body needs the truncated
3182     // value.
3183     for (BasicBlock *BB : LoopBypassBlocks)
3184       BCResumeVal->addIncoming(II.getStartValue(), BB);
3185     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3186   }
3187 }
3188 
3189 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3190                                                       MDNode *OrigLoopID) {
3191   assert(L && "Expected valid loop.");
3192 
3193   // The trip counts should be cached by now.
3194   Value *Count = getOrCreateTripCount(L);
3195   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3196 
3197   // We need the OrigLoop (scalar loop part) latch terminator to help
3198   // produce correct debug info for the middle block BB instructions.
3199   // The legality check stage guarantees that the loop will have a single
3200   // latch.
3201   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3202          "Scalar loop latch terminator isn't a branch");
3203   BranchInst *ScalarLatchBr =
3204       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3205 
3206   // Add a check in the middle block to see if we have completed
3207   // all of the iterations in the first vector loop.
3208   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3209   // If tail is to be folded, we know we don't need to run the remainder.
3210   Value *CmpN = Builder.getTrue();
3211   if (!Cost->foldTailByMasking()) {
3212     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3213                            VectorTripCount, "cmp.n",
3214                            LoopMiddleBlock->getTerminator());
3215 
3216     // Here we use the same DebugLoc as the scalar loop latch branch instead
3217     // of the corresponding compare because they may have ended up with
3218     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3220     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3221   }
3222 
3223   BranchInst *BrInst =
3224       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3225   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3226   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3227 
3228   // Get ready to start creating new instructions into the vectorized body.
3229   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3230          "Inconsistent vector loop preheader");
3231   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3232 
3233   Optional<MDNode *> VectorizedLoopID =
3234       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3235                                       LLVMLoopVectorizeFollowupVectorized});
3236   if (VectorizedLoopID.hasValue()) {
3237     L->setLoopID(VectorizedLoopID.getValue());
3238 
3239     // Do not setAlreadyVectorized if loop attributes have been defined
3240     // explicitly.
3241     return LoopVectorPreHeader;
3242   }
3243 
3244   // Keep all loop hints from the original loop on the vector loop (we'll
3245   // replace the vectorizer-specific hints below).
3246   if (MDNode *LID = OrigLoop->getLoopID())
3247     L->setLoopID(LID);
3248 
3249   LoopVectorizeHints Hints(L, true, *ORE);
3250   Hints.setAlreadyVectorized();
3251 
3252 #ifdef EXPENSIVE_CHECKS
3253   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3254   LI->verify(*DT);
3255 #endif
3256 
3257   return LoopVectorPreHeader;
3258 }
3259 
3260 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3261   /*
3262    In this function we generate a new loop. The new loop will contain
3263    the vectorized instructions while the old loop will continue to run the
3264    scalar remainder.
3265 
3266        [ ] <-- loop iteration number check.
3267     /   |
3268    /    v
3269   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3270   |  /  |
3271   | /   v
3272   ||   [ ]     <-- vector pre header.
3273   |/    |
3274   |     v
3275   |    [  ] \
3276   |    [  ]_|   <-- vector loop.
3277   |     |
3278   |     v
3279   |   -[ ]   <--- middle-block.
3280   |  /  |
3281   | /   v
3282   -|- >[ ]     <--- new preheader.
3283    |    |
3284    |    v
3285    |   [ ] \
3286    |   [ ]_|   <-- old scalar loop to handle remainder.
3287     \   |
3288      \  v
3289       >[ ]     <-- exit block.
3290    ...
3291    */
3292 
3293   // Get the metadata of the original loop before it gets modified.
3294   MDNode *OrigLoopID = OrigLoop->getLoopID();
3295 
3296   // Create an empty vector loop, and prepare basic blocks for the runtime
3297   // checks.
3298   Loop *Lp = createVectorLoopSkeleton("");
3299 
3300   // Now, compare the new count to zero. If it is zero skip the vector loop and
3301   // jump to the scalar loop. This check also covers the case where the
3302   // backedge-taken count is uint##_max: adding one to it will overflow leading
3303   // to an incorrect trip count of zero. In this (rare) case we will also jump
3304   // to the scalar loop.
3305   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3306 
3307   // Generate the code to check any assumptions that we've made for SCEV
3308   // expressions.
3309   emitSCEVChecks(Lp, LoopScalarPreHeader);
3310 
3311   // Generate the code that checks in runtime if arrays overlap. We put the
3312   // checks into a separate block to make the more common case of few elements
3313   // faster.
3314   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3315 
3316   // Some loops have a single integer induction variable, while other loops
3317   // don't. One example is c++ iterators that often have multiple pointer
3318   // induction variables. In the code below we also support a case where we
3319   // don't have a single induction variable.
3320   //
3321   // We try to obtain an induction variable from the original loop as hard
3322   // as possible. However if we don't find one that:
3323   //   - is an integer
3324   //   - counts from zero, stepping by one
3325   //   - is the size of the widest induction variable type
3326   // then we create a new one.
3327   OldInduction = Legal->getPrimaryInduction();
3328   Type *IdxTy = Legal->getWidestInductionType();
3329   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3330   // The loop step is equal to the vectorization factor (num of SIMD elements)
3331   // times the unroll factor (num of SIMD instructions).
3332   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3333   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
3334   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3335   Induction =
3336       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3337                               getDebugLocFromInstOrOperands(OldInduction));
3338 
3339   // Emit phis for the new starting index of the scalar loop.
3340   createInductionResumeValues(Lp, CountRoundDown);
3341 
3342   return completeLoopSkeleton(Lp, OrigLoopID);
3343 }
3344 
3345 // Fix up external users of the induction variable. At this point, we are
3346 // in LCSSA form, with all external PHIs that use the IV having one input value,
3347 // coming from the remainder loop. We need those PHIs to also have a correct
3348 // value for the IV when arriving directly from the middle block.
3349 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3350                                        const InductionDescriptor &II,
3351                                        Value *CountRoundDown, Value *EndValue,
3352                                        BasicBlock *MiddleBlock) {
3353   // There are two kinds of external IV usages - those that use the value
3354   // computed in the last iteration (the PHI) and those that use the penultimate
3355   // value (the value that feeds into the phi from the loop latch).
3356   // We allow both, but they, obviously, have different values.
3357 
3358   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3359 
3360   DenseMap<Value *, Value *> MissingVals;
3361 
3362   // An external user of the last iteration's value should see the value that
3363   // the remainder loop uses to initialize its own IV.
3364   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3365   for (User *U : PostInc->users()) {
3366     Instruction *UI = cast<Instruction>(U);
3367     if (!OrigLoop->contains(UI)) {
3368       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3369       MissingVals[UI] = EndValue;
3370     }
3371   }
3372 
  // An external user of the penultimate value needs to see EndValue - Step.
3374   // The simplest way to get this is to recompute it from the constituent SCEVs,
3375   // that is Start + (Step * (CRD - 1)).
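  // E.g., with start 0, step 1 and a vector trip count (CRD) of 8, the
  // penultimate value is 0 + 1 * (8 - 1) = 7.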
3376   for (User *U : OrigPhi->users()) {
3377     auto *UI = cast<Instruction>(U);
3378     if (!OrigLoop->contains(UI)) {
3379       const DataLayout &DL =
3380           OrigLoop->getHeader()->getModule()->getDataLayout();
3381       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3382 
3383       IRBuilder<> B(MiddleBlock->getTerminator());
3384       Value *CountMinusOne = B.CreateSub(
3385           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3386       Value *CMO =
3387           !II.getStep()->getType()->isIntegerTy()
3388               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3389                              II.getStep()->getType())
3390               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3391       CMO->setName("cast.cmo");
3392       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3393       Escape->setName("ind.escape");
3394       MissingVals[UI] = Escape;
3395     }
3396   }
3397 
3398   for (auto &I : MissingVals) {
3399     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3401     // that is %IV2 = phi [...], [ %IV1, %latch ]
3402     // In this case, if IV1 has an external use, we need to avoid adding both
3403     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3404     // don't already have an incoming value for the middle block.
3405     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3406       PHI->addIncoming(I.second, MiddleBlock);
3407   }
3408 }
3409 
3410 namespace {
3411 
3412 struct CSEDenseMapInfo {
3413   static bool canHandle(const Instruction *I) {
3414     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3415            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3416   }
3417 
3418   static inline Instruction *getEmptyKey() {
3419     return DenseMapInfo<Instruction *>::getEmptyKey();
3420   }
3421 
3422   static inline Instruction *getTombstoneKey() {
3423     return DenseMapInfo<Instruction *>::getTombstoneKey();
3424   }
3425 
3426   static unsigned getHashValue(const Instruction *I) {
3427     assert(canHandle(I) && "Unknown instruction!");
3428     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3429                                                            I->value_op_end()));
3430   }
3431 
3432   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3433     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3434         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3435       return LHS == RHS;
3436     return LHS->isIdenticalTo(RHS);
3437   }
3438 };
3439 
3440 } // end anonymous namespace
3441 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3445   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3446   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3447     Instruction *In = &*I++;
3448 
3449     if (!CSEDenseMapInfo::canHandle(In))
3450       continue;
3451 
3452     // Check if we can replace this instruction with any of the
3453     // visited instructions.
3454     if (Instruction *V = CSEMap.lookup(In)) {
3455       In->replaceAllUsesWith(V);
3456       In->eraseFromParent();
3457       continue;
3458     }
3459 
3460     CSEMap[In] = In;
3461   }
3462 }
3463 
3464 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3465                                                        ElementCount VF,
3466                                                        bool &NeedToScalarize) {
3467   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3468   Function *F = CI->getCalledFunction();
3469   Type *ScalarRetTy = CI->getType();
3470   SmallVector<Type *, 4> Tys, ScalarTys;
3471   for (auto &ArgOp : CI->arg_operands())
3472     ScalarTys.push_back(ArgOp->getType());
3473 
3474   // Estimate cost of scalarized vector call. The source operands are assumed
3475   // to be vectors, so we need to extract individual elements from there,
3476   // execute VF scalar calls, and then gather the result into the vector return
3477   // value.
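  // E.g., with VF = 4, a scalar call cost of 10 and a scalarization overhead
  // of 8, the scalarized estimate below is 4 * 10 + 8 = 48.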
3478   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3479                                                  TTI::TCK_RecipThroughput);
3480   if (VF.isScalar())
3481     return ScalarCallCost;
3482 
3483   // Compute corresponding vector type for return value and arguments.
3484   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3485   for (Type *ScalarTy : ScalarTys)
3486     Tys.push_back(ToVectorTy(ScalarTy, VF));
3487 
3488   // Compute costs of unpacking argument values for the scalar calls and
3489   // packing the return values to a vector.
3490   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3491 
3492   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3493 
3494   // If we can't emit a vector call for this function, then the currently found
3495   // cost is the cost we need to return.
3496   NeedToScalarize = true;
3497   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3498   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3499 
3500   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3501     return Cost;
3502 
3503   // If the corresponding vector cost is cheaper, return its cost.
3504   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3505                                                  TTI::TCK_RecipThroughput);
3506   if (VectorCallCost < Cost) {
3507     NeedToScalarize = false;
3508     return VectorCallCost;
3509   }
3510   return Cost;
3511 }
3512 
3513 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3514                                                             ElementCount VF) {
3515   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3516   assert(ID && "Expected intrinsic call!");
3517 
3518   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3519   return TTI.getIntrinsicInstrCost(CostAttrs,
3520                                    TargetTransformInfo::TCK_RecipThroughput);
3521 }
3522 
3523 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3524   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3525   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3526   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3527 }
3528 
3529 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3530   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3531   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3532   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3533 }
3534 
3535 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3536   // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and re-extend its result. InstCombine runs
3538   // later and will remove any ext/trunc pairs.
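  // E.g., an i32 add whose value only needs 8 bits is rewritten as: truncate
  // both operands to <VF x i8>, perform the add in i8, then zero-extend the
  // result back to <VF x i32>.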
3539   SmallPtrSet<Value *, 4> Erased;
3540   for (const auto &KV : Cost->getMinimalBitwidths()) {
3541     // If the value wasn't vectorized, we must maintain the original scalar
3542     // type. The absence of the value from VectorLoopValueMap indicates that it
3543     // wasn't vectorized.
3544     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3545       continue;
3546     for (unsigned Part = 0; Part < UF; ++Part) {
3547       Value *I = getOrCreateVectorValue(KV.first, Part);
3548       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3549         continue;
3550       Type *OriginalTy = I->getType();
3551       Type *ScalarTruncatedTy =
3552           IntegerType::get(OriginalTy->getContext(), KV.second);
3553       auto *TruncatedTy = FixedVectorType::get(
3554           ScalarTruncatedTy,
3555           cast<FixedVectorType>(OriginalTy)->getNumElements());
3556       if (TruncatedTy == OriginalTy)
3557         continue;
3558 
3559       IRBuilder<> B(cast<Instruction>(I));
3560       auto ShrinkOperand = [&](Value *V) -> Value * {
3561         if (auto *ZI = dyn_cast<ZExtInst>(V))
3562           if (ZI->getSrcTy() == TruncatedTy)
3563             return ZI->getOperand(0);
3564         return B.CreateZExtOrTrunc(V, TruncatedTy);
3565       };
3566 
3567       // The actual instruction modification depends on the instruction type,
3568       // unfortunately.
3569       Value *NewI = nullptr;
3570       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3571         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3572                              ShrinkOperand(BO->getOperand(1)));
3573 
3574         // Any wrapping introduced by shrinking this operation shouldn't be
3575         // considered undefined behavior. So, we can't unconditionally copy
3576         // arithmetic wrapping flags to NewI.
3577         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3578       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3579         NewI =
3580             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3581                          ShrinkOperand(CI->getOperand(1)));
3582       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3583         NewI = B.CreateSelect(SI->getCondition(),
3584                               ShrinkOperand(SI->getTrueValue()),
3585                               ShrinkOperand(SI->getFalseValue()));
3586       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3587         switch (CI->getOpcode()) {
3588         default:
3589           llvm_unreachable("Unhandled cast!");
3590         case Instruction::Trunc:
3591           NewI = ShrinkOperand(CI->getOperand(0));
3592           break;
3593         case Instruction::SExt:
3594           NewI = B.CreateSExtOrTrunc(
3595               CI->getOperand(0),
3596               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3597           break;
3598         case Instruction::ZExt:
3599           NewI = B.CreateZExtOrTrunc(
3600               CI->getOperand(0),
3601               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3602           break;
3603         }
3604       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3605         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3606                              ->getNumElements();
3607         auto *O0 = B.CreateZExtOrTrunc(
3608             SI->getOperand(0),
3609             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3610         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3611                              ->getNumElements();
3612         auto *O1 = B.CreateZExtOrTrunc(
3613             SI->getOperand(1),
3614             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3615 
3616         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3617       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3618         // Don't do anything with the operands, just extend the result.
3619         continue;
3620       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3621         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3622                             ->getNumElements();
3623         auto *O0 = B.CreateZExtOrTrunc(
3624             IE->getOperand(0),
3625             FixedVectorType::get(ScalarTruncatedTy, Elements));
3626         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3627         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3628       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3629         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3630                             ->getNumElements();
3631         auto *O0 = B.CreateZExtOrTrunc(
3632             EE->getOperand(0),
3633             FixedVectorType::get(ScalarTruncatedTy, Elements));
3634         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3635       } else {
3636         // If we don't know what to do, be conservative and don't do anything.
3637         continue;
3638       }
3639 
3640       // Lastly, extend the result.
3641       NewI->takeName(cast<Instruction>(I));
3642       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3643       I->replaceAllUsesWith(Res);
3644       cast<Instruction>(I)->eraseFromParent();
3645       Erased.insert(I);
3646       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3647     }
3648   }
3649 
  // We'll have created a bunch of ZExts that may now be unused. Clean them up.
3651   for (const auto &KV : Cost->getMinimalBitwidths()) {
3652     // If the value wasn't vectorized, we must maintain the original scalar
3653     // type. The absence of the value from VectorLoopValueMap indicates that it
3654     // wasn't vectorized.
3655     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3656       continue;
3657     for (unsigned Part = 0; Part < UF; ++Part) {
3658       Value *I = getOrCreateVectorValue(KV.first, Part);
3659       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3660       if (Inst && Inst->use_empty()) {
3661         Value *NewI = Inst->getOperand(0);
3662         Inst->eraseFromParent();
3663         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3664       }
3665     }
3666   }
3667 }
3668 
3669 void InnerLoopVectorizer::fixVectorizedLoop() {
3670   // Insert truncates and extends for any truncated instructions as hints to
3671   // InstCombine.
3672   if (VF.isVector())
3673     truncateToMinimalBitwidths();
3674 
3675   // Fix widened non-induction PHIs by setting up the PHI operands.
3676   if (OrigPHIsToFix.size()) {
3677     assert(EnableVPlanNativePath &&
3678            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3679     fixNonInductionPHIs();
3680   }
3681 
3682   // At this point every instruction in the original loop is widened to a
3683   // vector form. Now we need to fix the recurrences in the loop. These PHI
3684   // nodes are currently empty because we did not want to introduce cycles.
3685   // This is the second stage of vectorizing recurrences.
3686   fixCrossIterationPHIs();
3687 
3688   // Forget the original basic block.
3689   PSE.getSE()->forgetLoop(OrigLoop);
3690 
3691   // Fix-up external users of the induction variables.
3692   for (auto &Entry : Legal->getInductionVars())
3693     fixupIVUsers(Entry.first, Entry.second,
3694                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3695                  IVEndValues[Entry.first], LoopMiddleBlock);
3696 
3697   fixLCSSAPHIs();
3698   for (Instruction *PI : PredicatedInstructions)
3699     sinkScalarOperands(&*PI);
3700 
3701   // Remove redundant induction instructions.
3702   cse(LoopVectorBody);
3703 
  // Set/update profile weights for the vector and remainder loops as the
  // original loop iterations are now distributed among them. Note that the
  // original loop, represented by LoopScalarBody, becomes the remainder loop
  // after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3713   assert(!VF.isScalable() &&
3714          "cannot use scalable ElementCount to determine unroll factor");
3715   setProfileInfoAfterUnrolling(
3716       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3717       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3718 }
3719 
3720 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3721   // In order to support recurrences we need to be able to vectorize Phi nodes.
3722   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3723   // stage #2: We now need to fix the recurrences by adding incoming edges to
3724   // the currently empty PHI nodes. At this point every instruction in the
3725   // original loop is widened to a vector form so we can use them to construct
3726   // the incoming edges.
3727   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3728     // Handle first-order recurrences and reductions that need to be fixed.
3729     if (Legal->isFirstOrderRecurrence(&Phi))
3730       fixFirstOrderRecurrence(&Phi);
3731     else if (Legal->isReductionVariable(&Phi))
3732       fixReduction(&Phi);
3733   }
3734 }
3735 
3736 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3737   // This is the second phase of vectorizing first-order recurrences. An
3738   // overview of the transformation is described below. Suppose we have the
3739   // following loop.
3740   //
3741   //   for (int i = 0; i < n; ++i)
3742   //     b[i] = a[i] - a[i - 1];
3743   //
3744   // There is a first-order recurrence on "a". For this loop, the shorthand
3745   // scalar IR looks like:
3746   //
3747   //   scalar.ph:
3748   //     s_init = a[-1]
3749   //     br scalar.body
3750   //
3751   //   scalar.body:
3752   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3753   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3754   //     s2 = a[i]
3755   //     b[i] = s2 - s1
3756   //     br cond, scalar.body, ...
3757   //
  // In this example, s1 is a recurrence because its value depends on the
3759   // previous iteration. In the first phase of vectorization, we created a
3760   // temporary value for s1. We now complete the vectorization and produce the
3761   // shorthand vector IR shown below (for VF = 4, UF = 1).
3762   //
3763   //   vector.ph:
3764   //     v_init = vector(..., ..., ..., a[-1])
3765   //     br vector.body
3766   //
3767   //   vector.body
3768   //     i = phi [0, vector.ph], [i+4, vector.body]
3769   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3770   //     v2 = a[i, i+1, i+2, i+3];
3771   //     v3 = vector(v1(3), v2(0, 1, 2))
3772   //     b[i, i+1, i+2, i+3] = v2 - v3
3773   //     br cond, vector.body, middle.block
3774   //
3775   //   middle.block:
3776   //     x = v2(3)
3777   //     br scalar.ph
3778   //
3779   //   scalar.ph:
3780   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3781   //     br scalar.body
3782   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3785 
3786   // Get the original loop preheader and single loop latch.
3787   auto *Preheader = OrigLoop->getLoopPreheader();
3788   auto *Latch = OrigLoop->getLoopLatch();
3789 
3790   // Get the initial and previous values of the scalar recurrence.
3791   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3792   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3793 
3794   // Create a vector from the initial value.
3795   auto *VectorInit = ScalarInit;
3796   if (VF.isVector()) {
3797     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3798     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3799     VectorInit = Builder.CreateInsertElement(
3800         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3801         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3802   }
3803 
3804   // We constructed a temporary phi node in the first phase of vectorization.
3805   // This phi node will eventually be deleted.
3806   Builder.SetInsertPoint(
3807       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3808 
3809   // Create a phi node for the new recurrence. The current value will either be
3810   // the initial value inserted into a vector or loop-varying vector value.
3811   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3812   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3813 
3814   // Get the vectorized previous value of the last part UF - 1. It appears last
3815   // among all unrolled iterations, due to the order of their construction.
3816   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3817 
3818   // Find and set the insertion point after the previous value if it is an
3819   // instruction.
3820   BasicBlock::iterator InsertPt;
3821   // Note that the previous value may have been constant-folded so it is not
3822   // guaranteed to be an instruction in the vector loop.
3823   // FIXME: Loop invariant values do not form recurrences. We should deal with
3824   //        them earlier.
3825   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3826     InsertPt = LoopVectorBody->getFirstInsertionPt();
3827   else {
3828     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3829     if (isa<PHINode>(PreviousLastPart))
3830       // If the previous value is a phi node, we should insert after all the phi
3831       // nodes in the block containing the PHI to avoid breaking basic block
3832       // verification. Note that the basic block may be different to
3833       // LoopVectorBody, in case we predicate the loop.
3834       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3835     else
3836       InsertPt = ++PreviousInst->getIterator();
3837   }
3838   Builder.SetInsertPoint(&*InsertPt);
3839 
3840   // We will construct a vector for the recurrence by combining the values for
3841   // the current and previous iterations. This is the required shuffle mask.
3842   assert(!VF.isScalable());
3843   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
3844   ShuffleMask[0] = VF.getKnownMinValue() - 1;
3845   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
3846     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
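  // For example, for VF = 4 the mask is <3, 4, 5, 6>, i.e. lane 3 of Incoming
  // followed by lanes 0, 1 and 2 of the previous part (cf. v3 in the example
  // above).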
3847 
3848   // The vector from which to take the initial value for the current iteration
3849   // (actual or unrolled). Initially, this is the vector phi node.
3850   Value *Incoming = VecPhi;
3851 
3852   // Shuffle the current and previous vector and update the vector parts.
3853   for (unsigned Part = 0; Part < UF; ++Part) {
3854     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3855     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3856     auto *Shuffle =
3857         VF.isVector()
3858             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
3859             : Incoming;
3860     PhiPart->replaceAllUsesWith(Shuffle);
3861     cast<Instruction>(PhiPart)->eraseFromParent();
3862     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3863     Incoming = PreviousPart;
3864   }
3865 
3866   // Fix the latch value of the new recurrence in the vector loop.
3867   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3868 
3869   // Extract the last vector element in the middle block. This will be the
3870   // initial value for the recurrence when jumping to the scalar loop.
3871   auto *ExtractForScalar = Incoming;
3872   if (VF.isVector()) {
3873     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3874     ExtractForScalar = Builder.CreateExtractElement(
3875         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
3876         "vector.recur.extract");
3877   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value used when jumping to the exit block from the
  // LoopMiddleBlock, when the scalar loop is not run at all.
3883   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3884   if (VF.isVector())
3885     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3886         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3887         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
3892   else if (UF > 1)
3893     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3894 
3895   // Fix the initial value of the original recurrence in the scalar loop.
3896   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3897   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3898   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3899     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3900     Start->addIncoming(Incoming, BB);
3901   }
3902 
3903   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3904   Phi->setName("scalar.recur");
3905 
3906   // Finally, fix users of the recurrence outside the loop. The users will need
3907   // either the last value of the scalar recurrence or the last value of the
3908   // vector recurrence we extracted in the middle block. Since the loop is in
3909   // LCSSA form, we just need to find all the phi nodes for the original scalar
3910   // recurrence in the exit block, and then add an edge for the middle block.
3911   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3912     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3913       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3914     }
3915   }
3916 }
3917 
3918 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3919   Constant *Zero = Builder.getInt32(0);
3920 
  // Get its reduction variable descriptor.
3922   assert(Legal->isReductionVariable(Phi) &&
3923          "Unable to find the reduction variable");
3924   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3925 
3926   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3927   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3928   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3929   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3930     RdxDesc.getMinMaxRecurrenceKind();
3931   setDebugLocFromInst(Builder, ReductionStartValue);
3932   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3933 
3934   // We need to generate a reduction vector from the incoming scalar.
3935   // To do so, we need to generate the 'identity' vector and override
3936   // one of the elements with the incoming scalar reduction. We need
3937   // to do it in the vector-loop preheader.
3938   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3939 
3940   // This is the vector-clone of the value that leaves the loop.
3941   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3942 
  // Find the reduction identity variable. Zero for addition, or and xor;
  // one for multiplication; -1 for and.
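  // For example, for a widened i32 add reduction with VF = 4, Identity is
  // <0, 0, 0, 0> and VectorStart is that vector with the scalar start value
  // inserted into lane 0.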
3945   Value *Identity;
3946   Value *VectorStart;
3947   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3948       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3950     if (VF.isScalar() || IsInLoopReductionPhi) {
3951       VectorStart = Identity = ReductionStartValue;
3952     } else {
3953       VectorStart = Identity =
3954         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3955     }
3956   } else {
3957     // Handle other reduction kinds:
3958     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3959         RK, MinMaxKind, VecTy->getScalarType());
3960     if (VF.isScalar() || IsInLoopReductionPhi) {
3961       Identity = Iden;
      // In the scalar (or in-loop reduction) case, the start value is used
      // directly rather than being inserted into an identity vector.
      VectorStart = ReductionStartValue;
3965     } else {
3966       Identity = ConstantVector::getSplat(VF, Iden);
3967 
3968       // This vector is the Identity vector where the first element is the
3969       // incoming scalar reduction.
3970       VectorStart =
3971         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3972     }
3973   }
3974 
3975   // Wrap flags are in general invalid after vectorization, clear them.
3976   clearReductionWrapFlags(RdxDesc);
3977 
3978   // Fix the vector-loop phi.
3979 
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
3982   BasicBlock *Latch = OrigLoop->getLoopLatch();
3983   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3984 
3985   for (unsigned Part = 0; Part < UF; ++Part) {
3986     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3987     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3988     // Make sure to add the reduction start value only to the
3989     // first unroll part.
3990     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3991     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3992     cast<PHINode>(VecRdxPhi)
3993       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3994   }
3995 
3996   // Before each round, move the insertion point right between
3997   // the PHIs and the values we are going to write.
3998   // This allows us to write both PHINodes and the extractelement
3999   // instructions.
4000   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4001 
4002   setDebugLocFromInst(Builder, LoopExitInst);
4003 
4004   // If tail is folded by masking, the vector value to leave the loop should be
4005   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4006   // instead of the former. For an inloop reduction the reduction will already
4007   // be predicated, and does not need to be handled here.
4008   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4009     for (unsigned Part = 0; Part < UF; ++Part) {
4010       Value *VecLoopExitInst =
4011           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4012       Value *Sel = nullptr;
4013       for (User *U : VecLoopExitInst->users()) {
4014         if (isa<SelectInst>(U)) {
4015           assert(!Sel && "Reduction exit feeding two selects");
4016           Sel = U;
4017         } else
4018           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4019       }
4020       assert(Sel && "Reduction exit feeds no select");
4021       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4022 
4023       // If the target can create a predicated operator for the reduction at no
4024       // extra cost in the loop (for example a predicated vadd), it can be
4025       // cheaper for the select to remain in the loop than be sunk out of it,
4026       // and so use the select value for the phi instead of the old
4027       // LoopExitValue.
4028       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4029       if (PreferPredicatedReductionSelect ||
4030           TTI->preferPredicatedReductionSelect(
4031               RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4032               TargetTransformInfo::ReductionFlags())) {
4033         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4034         VecRdxPhi->setIncomingValueForBlock(
4035             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4036       }
4037     }
4038   }
4039 
4040   // If the vector reduction can be performed in a smaller type, we truncate
4041   // then extend the loop exit value to enable InstCombine to evaluate the
4042   // entire expression in the smaller type.
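  // For example, if the recurrence type is i8 but the phi is i32, each part is
  // truncated to <VF x i8> in the loop latch and sign- or zero-extended back
  // (matching the recurrence's signedness), and the extended value replaces
  // the original in its other users.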
4043   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4044     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4045     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4046     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4047     Builder.SetInsertPoint(
4048         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4049     VectorParts RdxParts(UF);
4050     for (unsigned Part = 0; Part < UF; ++Part) {
4051       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4052       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4053       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4054                                         : Builder.CreateZExt(Trunc, VecTy);
4055       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4056            UI != RdxParts[Part]->user_end();)
4057         if (*UI != Trunc) {
4058           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4059           RdxParts[Part] = Extnd;
4060         } else {
4061           ++UI;
4062         }
4063     }
4064     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4065     for (unsigned Part = 0; Part < UF; ++Part) {
4066       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4067       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4068     }
4069   }
4070 
4071   // Reduce all of the unrolled parts into a single vector.
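  // For example, with UF = 2 and an integer add reduction, the two parts are
  // combined with a single "bin.rdx" add below.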
4072   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4073   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4074 
4075   // The middle block terminator has already been assigned a DebugLoc here (the
4076   // OrigLoop's single latch terminator). We want the whole middle block to
4077   // appear to execute on this line because: (a) it is all compiler generated,
4078   // (b) these instructions are always executed after evaluating the latch
4079   // conditional branch, and (c) other passes may add new predecessors which
4080   // terminate on this line. This is the easiest way to ensure we don't
4081   // accidentally cause an extra step back into the loop while debugging.
4082   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4083   for (unsigned Part = 1; Part < UF; ++Part) {
4084     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4085     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4086       // Floating point operations had to be 'fast' to enable the reduction.
4087       ReducedPartRdx = addFastMathFlag(
4088           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4089                               ReducedPartRdx, "bin.rdx"),
4090           RdxDesc.getFastMathFlags());
4091     else
4092       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4093                                       RdxPart);
4094   }
4095 
4096   // Create the reduction after the loop. Note that inloop reductions create the
4097   // target reduction in the loop using a Reduction recipe.
4098   if (VF.isVector() && !IsInLoopReductionPhi) {
4099     bool NoNaN = Legal->hasFunNoNaNAttr();
4100     ReducedPartRdx =
4101         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4102     // If the reduction can be performed in a smaller type, we need to extend
4103     // the reduction to the wider type before we branch to the original loop.
4104     if (Phi->getType() != RdxDesc.getRecurrenceType())
4105       ReducedPartRdx =
4106         RdxDesc.isSigned()
4107         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4108         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4109   }
4110 
4111   // Create a phi node that merges control-flow from the backedge-taken check
4112   // block and the middle block.
4113   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4114                                         LoopScalarPreHeader->getTerminator());
4115   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4116     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4117   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4118 
4119   // Now, we need to fix the users of the reduction variable
4120   // inside and outside of the scalar remainder loop.
4121   // We know that the loop is in LCSSA form. We need to update the
4122   // PHI nodes in the exit blocks.
4123   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4124     // All PHINodes need to have a single entry edge, or two if
4125     // we already fixed them.
4126     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4127 
4128     // We found a reduction value exit-PHI. Update it with the
4129     // incoming bypass edge.
4130     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4131       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4132   } // end of the LCSSA phi scan.
4133 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4136   int IncomingEdgeBlockIdx =
4137     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4138   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4139   // Pick the other block.
4140   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4141   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4142   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4143 }
4144 
4145 void InnerLoopVectorizer::clearReductionWrapFlags(
4146     RecurrenceDescriptor &RdxDesc) {
4147   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4148   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4149       RK != RecurrenceDescriptor::RK_IntegerMult)
4150     return;
4151 
4152   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4153   assert(LoopExitInstr && "null loop exit instruction");
4154   SmallVector<Instruction *, 8> Worklist;
4155   SmallPtrSet<Instruction *, 8> Visited;
4156   Worklist.push_back(LoopExitInstr);
4157   Visited.insert(LoopExitInstr);
4158 
4159   while (!Worklist.empty()) {
4160     Instruction *Cur = Worklist.pop_back_val();
4161     if (isa<OverflowingBinaryOperator>(Cur))
4162       for (unsigned Part = 0; Part < UF; ++Part) {
4163         Value *V = getOrCreateVectorValue(Cur, Part);
4164         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4165       }
4166 
4167     for (User *U : Cur->users()) {
4168       Instruction *UI = cast<Instruction>(U);
4169       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4170           Visited.insert(UI).second)
4171         Worklist.push_back(UI);
4172     }
4173   }
4174 }
4175 
4176 void InnerLoopVectorizer::fixLCSSAPHIs() {
4177   assert(!VF.isScalable() && "the code below assumes fixed width vectors");
4178   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4179     if (LCSSAPhi.getNumIncomingValues() == 1) {
4180       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4181       // Non-instruction incoming values will have only one value.
4182       unsigned LastLane = 0;
4183       if (isa<Instruction>(IncomingValue))
4184         LastLane = Cost->isUniformAfterVectorization(
4185                        cast<Instruction>(IncomingValue), VF)
4186                        ? 0
4187                        : VF.getKnownMinValue() - 1;
4188       // Can be a loop invariant incoming value or the last scalar value to be
4189       // extracted from the vectorized loop.
4190       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4191       Value *lastIncomingValue =
4192           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4193       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4194     }
4195   }
4196 }
4197 
4198 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4199   // The basic block and loop containing the predicated instruction.
4200   auto *PredBB = PredInst->getParent();
4201   auto *VectorLoop = LI->getLoopFor(PredBB);
4202 
4203   // Initialize a worklist with the operands of the predicated instruction.
4204   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4205 
4206   // Holds instructions that we need to analyze again. An instruction may be
4207   // reanalyzed if we don't yet know if we can sink it or not.
4208   SmallVector<Instruction *, 8> InstsToReanalyze;
4209 
4210   // Returns true if a given use occurs in the predicated block. Phi nodes use
4211   // their operands in their corresponding predecessor blocks.
4212   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4213     auto *I = cast<Instruction>(U.getUser());
4214     BasicBlock *BB = I->getParent();
4215     if (auto *Phi = dyn_cast<PHINode>(I))
4216       BB = Phi->getIncomingBlock(
4217           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4218     return BB == PredBB;
4219   };
4220 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a pass
  // through the worklist doesn't sink a single instruction.
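  // For example, if the address computation (a getelementptr and its index
  // arithmetic) feeding a scalarized, predicated load is used only by that
  // load, it is moved into the predicated block as well.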
4225   bool Changed;
4226   do {
4227     // Add the instructions that need to be reanalyzed to the worklist, and
4228     // reset the changed indicator.
4229     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4230     InstsToReanalyze.clear();
4231     Changed = false;
4232 
4233     while (!Worklist.empty()) {
4234       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4235 
4236       // We can't sink an instruction if it is a phi node, is already in the
4237       // predicated block, is not in the loop, or may have side effects.
4238       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4239           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4240         continue;
4241 
4242       // It's legal to sink the instruction if all its uses occur in the
4243       // predicated block. Otherwise, there's nothing to do yet, and we may
4244       // need to reanalyze the instruction.
4245       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4246         InstsToReanalyze.push_back(I);
4247         continue;
4248       }
4249 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4252       I->moveBefore(&*PredBB->getFirstInsertionPt());
4253       Worklist.insert(I->op_begin(), I->op_end());
4254 
4255       // The sinking may have enabled other instructions to be sunk, so we will
4256       // need to iterate.
4257       Changed = true;
4258     }
4259   } while (Changed);
4260 }
4261 
4262 void InnerLoopVectorizer::fixNonInductionPHIs() {
4263   for (PHINode *OrigPhi : OrigPHIsToFix) {
4264     PHINode *NewPhi =
4265         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4266     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4267 
4268     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4269         predecessors(OrigPhi->getParent()));
4270     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4271         predecessors(NewPhi->getParent()));
4272     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4273            "Scalar and Vector BB should have the same number of predecessors");
4274 
4275     // The insertion point in Builder may be invalidated by the time we get
4276     // here. Force the Builder insertion point to something valid so that we do
4277     // not run into issues during insertion point restore in
4278     // getOrCreateVectorValue calls below.
4279     Builder.SetInsertPoint(NewPhi);
4280 
4281     // The predecessor order is preserved and we can rely on mapping between
4282     // scalar and vector block predecessors.
4283     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4284       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4285 
4286       // When looking up the new scalar/vector values to fix up, use incoming
4287       // values from original phi.
4288       Value *ScIncV =
4289           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4290 
4291       // Scalar incoming value may need a broadcast
4292       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4293       NewPhi->addIncoming(NewIncV, NewPredBB);
4294     }
4295   }
4296 }
4297 
4298 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4299                                    VPUser &Operands, unsigned UF,
4300                                    ElementCount VF, bool IsPtrLoopInvariant,
4301                                    SmallBitVector &IsIndexLoopInvariant,
4302                                    VPTransformState &State) {
4303   // Construct a vector GEP by widening the operands of the scalar GEP as
4304   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4305   // results in a vector of pointers when at least one operand of the GEP
4306   // is vector-typed. Thus, to keep the representation compact, we only use
4307   // vector-typed operands for loop-varying values.
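  // For example, a GEP with a loop-invariant base pointer and a loop-varying
  // index is widened into a single GEP whose index operand is a vector,
  // yielding one vector of pointers per unroll part.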
4308 
4309   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4310     // If we are vectorizing, but the GEP has only loop-invariant operands,
4311     // the GEP we build (by only using vector-typed operands for
4312     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4313     // produce a vector of pointers, we need to either arbitrarily pick an
4314     // operand to broadcast, or broadcast a clone of the original GEP.
4315     // Here, we broadcast a clone of the original.
4316     //
4317     // TODO: If at some point we decide to scalarize instructions having
4318     //       loop-invariant operands, this special case will no longer be
4319     //       required. We would add the scalarization decision to
4320     //       collectLoopScalars() and teach getVectorValue() to broadcast
4321     //       the lane-zero scalar value.
4322     auto *Clone = Builder.Insert(GEP->clone());
4323     for (unsigned Part = 0; Part < UF; ++Part) {
4324       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4325       State.set(VPDef, GEP, EntryPart, Part);
4326       addMetadata(EntryPart, GEP);
4327     }
4328   } else {
4329     // If the GEP has at least one loop-varying operand, we are sure to
4330     // produce a vector of pointers. But if we are only unrolling, we want
4331     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4332     // produce with the code below will be scalar (if VF == 1) or vector
4333     // (otherwise). Note that for the unroll-only case, we still maintain
4334     // values in the vector mapping with initVector, as we do for other
4335     // instructions.
4336     for (unsigned Part = 0; Part < UF; ++Part) {
4337       // The pointer operand of the new GEP. If it's loop-invariant, we
4338       // won't broadcast it.
4339       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4340                                      : State.get(Operands.getOperand(0), Part);
4341 
4342       // Collect all the indices for the new GEP. If any index is
4343       // loop-invariant, we won't broadcast it.
4344       SmallVector<Value *, 4> Indices;
4345       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4346         VPValue *Operand = Operands.getOperand(I);
4347         if (IsIndexLoopInvariant[I - 1])
4348           Indices.push_back(State.get(Operand, {0, 0}));
4349         else
4350           Indices.push_back(State.get(Operand, Part));
4351       }
4352 
      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector otherwise.
4355       auto *NewGEP =
4356           GEP->isInBounds()
4357               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4358                                           Indices)
4359               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4360       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4361              "NewGEP is not a pointer vector");
4362       State.set(VPDef, GEP, NewGEP, Part);
4363       addMetadata(NewGEP, GEP);
4364     }
4365   }
4366 }
4367 
4368 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4369                                               ElementCount VF) {
4370   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4371   PHINode *P = cast<PHINode>(PN);
4372   if (EnableVPlanNativePath) {
4373     // Currently we enter here in the VPlan-native path for non-induction
4374     // PHIs where all control flow is uniform. We simply widen these PHIs.
4375     // Create a vector phi with no operands - the vector phi operands will be
4376     // set at the end of vector code generation.
4377     Type *VecTy =
4378         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4379     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4380     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4381     OrigPHIsToFix.push_back(P);
4382 
4383     return;
4384   }
4385 
4386   assert(PN->getParent() == OrigLoop->getHeader() &&
4387          "Non-header phis should have been handled elsewhere");
4388 
4389   // In order to support recurrences we need to be able to vectorize Phi nodes.
4390   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4391   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4392   // this value when we vectorize all of the instructions that use the PHI.
4393   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4394     for (unsigned Part = 0; Part < UF; ++Part) {
4395       // This is phase one of vectorizing PHIs.
4396       bool ScalarPHI =
4397           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4398       Type *VecTy =
4399           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4400       Value *EntryPart = PHINode::Create(
4401           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4402       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4403     }
4404     return;
4405   }
4406 
4407   setDebugLocFromInst(Builder, P);
4408 
4409   // This PHINode must be an induction variable.
4410   // Make sure that we know about it.
4411   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4412 
4413   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4414   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4415 
4416   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4417   // which can be found from the original scalar operations.
4418   switch (II.getKind()) {
4419   case InductionDescriptor::IK_NoInduction:
4420     llvm_unreachable("Unknown induction");
4421   case InductionDescriptor::IK_IntInduction:
4422   case InductionDescriptor::IK_FpInduction:
4423     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4424   case InductionDescriptor::IK_PtrInduction: {
4425     // Handle the pointer induction variable case.
4426     assert(P->getType()->isPointerTy() && "Unexpected type.");
4427 
4428     if (Cost->isScalarAfterVectorization(P, VF)) {
      // This is the normalized induction variable that starts counting at zero.
4430       Value *PtrInd =
4431           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4432       // Determine the number of scalars we need to generate for each unroll
4433       // iteration. If the instruction is uniform, we only need to generate the
4434       // first lane. Otherwise, we generate all VF values.
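      // For example, with VF = 4 and UF = 2, a non-uniform pointer induction
      // produces eight scalar "next.gep" values, one per (part, lane) pair.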
4435       unsigned Lanes =
4436           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4437       for (unsigned Part = 0; Part < UF; ++Part) {
4438         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4439           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4440                                            Lane + Part * VF.getKnownMinValue());
4441           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4442           Value *SclrGep =
4443               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4444           SclrGep->setName("next.gep");
4445           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4446         }
4447       }
4448       return;
4449     }
4450     assert(isa<SCEVConstant>(II.getStep()) &&
4451            "Induction step not a SCEV constant!");
4452     Type *PhiType = II.getStep()->getType();
4453 
4454     // Build a pointer phi
4455     Value *ScalarStartValue = II.getStartValue();
4456     Type *ScStValueType = ScalarStartValue->getType();
4457     PHINode *NewPointerPhi =
4458         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4459     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4460 
4461     // A pointer induction, performed by using a gep
4462     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4463     Instruction *InductionLoc = LoopLatch->getTerminator();
4464     const SCEV *ScalarStep = II.getStep();
4465     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4466     Value *ScalarStepValue =
4467         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4468     Value *InductionGEP = GetElementPtrInst::Create(
4469         ScStValueType->getPointerElementType(), NewPointerPhi,
4470         Builder.CreateMul(
4471             ScalarStepValue,
4472             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4473         "ptr.ind", InductionLoc);
4474     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4475 
4476     // Create UF many actual address geps that use the pointer
4477     // phi as base and a vectorized version of the step value
4478     // (<step*0, ..., step*N>) as offset.
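    // For example, with VF = 4 and UF = 2, part 0 uses offsets
    // <0, 1, 2, 3> * step and part 1 uses offsets <4, 5, 6, 7> * step.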
4479     for (unsigned Part = 0; Part < UF; ++Part) {
4480       SmallVector<Constant *, 8> Indices;
4481       // Create a vector of consecutive numbers from zero to VF.
4482       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4483         Indices.push_back(
4484             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4485       Constant *StartOffset = ConstantVector::get(Indices);
4486 
4487       Value *GEP = Builder.CreateGEP(
4488           ScStValueType->getPointerElementType(), NewPointerPhi,
4489           Builder.CreateMul(
4490               StartOffset,
4491               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4492               "vector.gep"));
4493       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4494     }
4495   }
4496   }
4497 }
4498 
4499 /// A helper function for checking whether an integer division-related
4500 /// instruction may divide by zero (in which case it must be predicated if
4501 /// executed conditionally in the scalar code).
4502 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
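/// For example, "udiv i32 %x, %n" must be predicated when executed
/// conditionally, since %n may be zero at run time, whereas "udiv i32 %x, 7"
/// need not be.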
4506 static bool mayDivideByZero(Instruction &I) {
4507   assert((I.getOpcode() == Instruction::UDiv ||
4508           I.getOpcode() == Instruction::SDiv ||
4509           I.getOpcode() == Instruction::URem ||
4510           I.getOpcode() == Instruction::SRem) &&
4511          "Unexpected instruction");
4512   Value *Divisor = I.getOperand(1);
4513   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4514   return !CInt || CInt->isZero();
4515 }
4516 
4517 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4518                                            VPUser &User,
4519                                            VPTransformState &State) {
4520   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4521   switch (I.getOpcode()) {
4522   case Instruction::Call:
4523   case Instruction::Br:
4524   case Instruction::PHI:
4525   case Instruction::GetElementPtr:
4526   case Instruction::Select:
4527     llvm_unreachable("This instruction is handled by a different recipe.");
4528   case Instruction::UDiv:
4529   case Instruction::SDiv:
4530   case Instruction::SRem:
4531   case Instruction::URem:
4532   case Instruction::Add:
4533   case Instruction::FAdd:
4534   case Instruction::Sub:
4535   case Instruction::FSub:
4536   case Instruction::FNeg:
4537   case Instruction::Mul:
4538   case Instruction::FMul:
4539   case Instruction::FDiv:
4540   case Instruction::FRem:
4541   case Instruction::Shl:
4542   case Instruction::LShr:
4543   case Instruction::AShr:
4544   case Instruction::And:
4545   case Instruction::Or:
4546   case Instruction::Xor: {
4547     // Just widen unops and binops.
4548     setDebugLocFromInst(Builder, &I);
4549 
4550     for (unsigned Part = 0; Part < UF; ++Part) {
4551       SmallVector<Value *, 2> Ops;
4552       for (VPValue *VPOp : User.operands())
4553         Ops.push_back(State.get(VPOp, Part));
4554 
4555       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4556 
4557       if (auto *VecOp = dyn_cast<Instruction>(V))
4558         VecOp->copyIRFlags(&I);
4559 
4560       // Use this vector value for all users of the original instruction.
4561       State.set(Def, &I, V, Part);
4562       addMetadata(V, &I);
4563     }
4564 
4565     break;
4566   }
4567   case Instruction::ICmp:
4568   case Instruction::FCmp: {
4569     // Widen compares. Generate vector compares.
4570     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4571     auto *Cmp = cast<CmpInst>(&I);
4572     setDebugLocFromInst(Builder, Cmp);
4573     for (unsigned Part = 0; Part < UF; ++Part) {
4574       Value *A = State.get(User.getOperand(0), Part);
4575       Value *B = State.get(User.getOperand(1), Part);
4576       Value *C = nullptr;
4577       if (FCmp) {
4578         // Propagate fast math flags.
4579         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4580         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4581         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4582       } else {
4583         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4584       }
4585       State.set(Def, &I, C, Part);
4586       addMetadata(C, &I);
4587     }
4588 
4589     break;
4590   }
4591 
4592   case Instruction::ZExt:
4593   case Instruction::SExt:
4594   case Instruction::FPToUI:
4595   case Instruction::FPToSI:
4596   case Instruction::FPExt:
4597   case Instruction::PtrToInt:
4598   case Instruction::IntToPtr:
4599   case Instruction::SIToFP:
4600   case Instruction::UIToFP:
4601   case Instruction::Trunc:
4602   case Instruction::FPTrunc:
4603   case Instruction::BitCast: {
4604     auto *CI = cast<CastInst>(&I);
4605     setDebugLocFromInst(Builder, CI);
4606 
    // Vectorize casts.
4608     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4609     Type *DestTy =
4610         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4611 
4612     for (unsigned Part = 0; Part < UF; ++Part) {
4613       Value *A = State.get(User.getOperand(0), Part);
4614       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4615       State.set(Def, &I, Cast, Part);
4616       addMetadata(Cast, &I);
4617     }
4618     break;
4619   }
4620   default:
4621     // This instruction is not vectorized by simple widening.
4622     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4623     llvm_unreachable("Unhandled instruction!");
4624   } // end of switch.
4625 }
4626 
4627 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4628                                                VPUser &ArgOperands,
4629                                                VPTransformState &State) {
4630   assert(!isa<DbgInfoIntrinsic>(I) &&
4631          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4632   setDebugLocFromInst(Builder, &I);
4633 
4634   Module *M = I.getParent()->getParent()->getParent();
4635   auto *CI = cast<CallInst>(&I);
4636 
4637   SmallVector<Type *, 4> Tys;
4638   for (Value *ArgOperand : CI->arg_operands())
4639     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4640 
4641   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4642 
  // This flag indicates whether we use an intrinsic or a regular call for the
  // vectorized version of the instruction, i.e., whether it is beneficial to
  // perform the intrinsic call rather than a library call.
4646   bool NeedToScalarize = false;
4647   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4648   bool UseVectorIntrinsic =
4649       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4650   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4651          "Instruction should be scalarized elsewhere.");
4652 
4653   for (unsigned Part = 0; Part < UF; ++Part) {
4654     SmallVector<Value *, 4> Args;
4655     for (auto &I : enumerate(ArgOperands.operands())) {
4656       // Some intrinsics have a scalar argument - don't replace it with a
4657       // vector.
4658       Value *Arg;
4659       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4660         Arg = State.get(I.value(), Part);
4661       else
4662         Arg = State.get(I.value(), {0, 0});
4663       Args.push_back(Arg);
4664     }
4665 
4666     Function *VectorF;
4667     if (UseVectorIntrinsic) {
4668       // Use vector version of the intrinsic.
4669       Type *TysForDecl[] = {CI->getType()};
4670       if (VF.isVector()) {
4671         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4672         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4673       }
4674       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4675       assert(VectorF && "Can't retrieve vector intrinsic.");
4676     } else {
4677       // Use vector version of the function call.
4678       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4679 #ifndef NDEBUG
4680       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4681              "Can't create vector function.");
4682 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, &I, V, Part);
    addMetadata(V, &I);
4694   }
4695 }
4696 
4697 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4698                                                  VPUser &Operands,
4699                                                  bool InvariantCond,
4700                                                  VPTransformState &State) {
4701   setDebugLocFromInst(Builder, &I);
4702 
  // The condition can be loop invariant but still defined inside the
4704   // loop. This means that we can't just use the original 'cond' value.
4705   // We have to take the 'vectorized' value and pick the first lane.
4706   // Instcombine will make this a no-op.
4707   auto *InvarCond =
4708       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4709 
4710   for (unsigned Part = 0; Part < UF; ++Part) {
4711     Value *Cond =
4712         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4713     Value *Op0 = State.get(Operands.getOperand(1), Part);
4714     Value *Op1 = State.get(Operands.getOperand(2), Part);
4715     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4716     State.set(VPDef, &I, Sel, Part);
4717     addMetadata(Sel, &I);
4718   }
4719 }
4720 
4721 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4722   // We should not collect Scalars more than once per VF. Right now, this
4723   // function is called from collectUniformsAndScalars(), which already does
4724   // this check. Collecting Scalars for VF=1 does not make any sense.
4725   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4726          "This function should not be visited twice for the same VF");
4727 
4728   SmallSetVector<Instruction *, 8> Worklist;
4729 
4730   // These sets are used to seed the analysis with pointers used by memory
4731   // accesses that will remain scalar.
4732   SmallSetVector<Instruction *, 8> ScalarPtrs;
4733   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4734   auto *Latch = TheLoop->getLoopLatch();
4735 
4736   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4737   // The pointer operands of loads and stores will be scalar as long as the
4738   // memory access is not a gather or scatter operation. The value operand of a
4739   // store will remain scalar if the store is scalarized.
4740   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4741     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4742     assert(WideningDecision != CM_Unknown &&
4743            "Widening decision should be ready at this moment");
4744     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4745       if (Ptr == Store->getValueOperand())
4746         return WideningDecision == CM_Scalarize;
4747     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4748            "Ptr is neither a value or pointer operand");
4749     return WideningDecision != CM_GatherScatter;
4750   };
4751 
4752   // A helper that returns true if the given value is a bitcast or
4753   // getelementptr instruction contained in the loop.
4754   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4755     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4756             isa<GetElementPtrInst>(V)) &&
4757            !TheLoop->isLoopInvariant(V);
4758   };
4759 
4760   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4761     if (!isa<PHINode>(Ptr) ||
4762         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4763       return false;
4764     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4765     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4766       return false;
4767     return isScalarUse(MemAccess, Ptr);
4768   };
4769 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is inserted into
  // the Worklist. If the use will be a scalar use, and the pointer is only
  // used by memory accesses, we place the pointer in ScalarPtrs. Otherwise,
  // the pointer is placed in PossibleNonScalarPtrs.
4775   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4776     if (isScalarPtrInduction(MemAccess, Ptr)) {
4777       Worklist.insert(cast<Instruction>(Ptr));
4778       Instruction *Update = cast<Instruction>(
4779           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4780       Worklist.insert(Update);
4781       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4782                         << "\n");
4783       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4784                         << "\n");
4785       return;
4786     }
4787     // We only care about bitcast and getelementptr instructions contained in
4788     // the loop.
4789     if (!isLoopVaryingBitCastOrGEP(Ptr))
4790       return;
4791 
4792     // If the pointer has already been identified as scalar (e.g., if it was
4793     // also identified as uniform), there's nothing to do.
4794     auto *I = cast<Instruction>(Ptr);
4795     if (Worklist.count(I))
4796       return;
4797 
4798     // If the use of the pointer will be a scalar use, and all users of the
4799     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4800     // place the pointer in PossibleNonScalarPtrs.
4801     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4802           return isa<LoadInst>(U) || isa<StoreInst>(U);
4803         }))
4804       ScalarPtrs.insert(I);
4805     else
4806       PossibleNonScalarPtrs.insert(I);
4807   };
4808 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4813   //
4814   // (1) Add to the worklist all instructions that have been identified as
4815   // uniform-after-vectorization.
4816   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4817 
4818   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4819   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4821   // scatter operation. The value operand of a store will remain scalar if the
4822   // store is scalarized.
4823   for (auto *BB : TheLoop->blocks())
4824     for (auto &I : *BB) {
4825       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4826         evaluatePtrUse(Load, Load->getPointerOperand());
4827       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4828         evaluatePtrUse(Store, Store->getPointerOperand());
4829         evaluatePtrUse(Store, Store->getValueOperand());
4830       }
4831     }
4832   for (auto *I : ScalarPtrs)
4833     if (!PossibleNonScalarPtrs.count(I)) {
4834       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4835       Worklist.insert(I);
4836     }
4837 
4838   // Insert the forced scalars.
4839   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4840   // induction variable when the PHI user is scalarized.
4841   auto ForcedScalar = ForcedScalars.find(VF);
4842   if (ForcedScalar != ForcedScalars.end())
4843     for (auto *I : ForcedScalar->second)
4844       Worklist.insert(I);
4845 
4846   // Expand the worklist by looking through any bitcasts and getelementptr
4847   // instructions we've already identified as scalar. This is similar to the
4848   // expansion step in collectLoopUniforms(); however, here we're only
4849   // expanding to include additional bitcasts and getelementptr instructions.
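  // For example, if a bitcast in the Worklist takes its operand from a
  // getelementptr whose other users are all either already in the Worklist or
  // scalar memory accesses, the getelementptr is added to the Worklist too.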
4850   unsigned Idx = 0;
4851   while (Idx != Worklist.size()) {
4852     Instruction *Dst = Worklist[Idx++];
4853     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4854       continue;
4855     auto *Src = cast<Instruction>(Dst->getOperand(0));
4856     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4857           auto *J = cast<Instruction>(U);
4858           return !TheLoop->contains(J) || Worklist.count(J) ||
4859                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4860                   isScalarUse(J, Src));
4861         })) {
4862       Worklist.insert(Src);
4863       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4864     }
4865   }
4866 
4867   // An induction variable will remain scalar if all users of the induction
4868   // variable and induction variable update remain scalar.
4869   for (auto &Induction : Legal->getInductionVars()) {
4870     auto *Ind = Induction.first;
4871     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4872 
4873     // If tail-folding is applied, the primary induction variable will be used
4874     // to feed a vector compare.
4875     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4876       continue;
4877 
4878     // Determine if all users of the induction variable are scalar after
4879     // vectorization.
4880     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4881       auto *I = cast<Instruction>(U);
4882       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4883     });
4884     if (!ScalarInd)
4885       continue;
4886 
4887     // Determine if all users of the induction variable update instruction are
4888     // scalar after vectorization.
4889     auto ScalarIndUpdate =
4890         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4891           auto *I = cast<Instruction>(U);
4892           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4893         });
4894     if (!ScalarIndUpdate)
4895       continue;
4896 
4897     // The induction variable and its update instruction will remain scalar.
4898     Worklist.insert(Ind);
4899     Worklist.insert(IndUpdate);
4900     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4901     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4902                       << "\n");
4903   }
4904 
4905   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4906 }
4907 
4908 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
4909                                                          ElementCount VF) {
4910   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4911   if (!blockNeedsPredication(I->getParent()))
4912     return false;
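  // Only two kinds of instructions need to be scalarized and predicated here:
  // loads/stores that require a mask but cannot or will not be widened, and
  // integer divisions/remainders whose divisor may be zero on a masked-off
  // lane.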
  switch (I->getOpcode()) {
4914   default:
4915     break;
4916   case Instruction::Load:
4917   case Instruction::Store: {
4918     if (!Legal->isMaskRequired(I))
4919       return false;
4920     auto *Ptr = getLoadStorePointerOperand(I);
4921     auto *Ty = getMemInstValueType(I);
4922     // We have already decided how to vectorize this instruction, get that
4923     // result.
4924     if (VF.isVector()) {
4925       InstWidening WideningDecision = getWideningDecision(I, VF);
4926       assert(WideningDecision != CM_Unknown &&
4927              "Widening decision should be ready at this moment");
4928       return WideningDecision == CM_Scalarize;
4929     }
4930     const Align Alignment = getLoadStoreAlignment(I);
4931     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4932                                 isLegalMaskedGather(Ty, Alignment))
4933                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4934                                 isLegalMaskedScatter(Ty, Alignment));
4935   }
4936   case Instruction::UDiv:
4937   case Instruction::SDiv:
4938   case Instruction::SRem:
4939   case Instruction::URem:
4940     return mayDivideByZero(*I);
4941   }
4942   return false;
4943 }
4944 
4945 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4946     Instruction *I, ElementCount VF) {
4947   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4948   assert(getWideningDecision(I, VF) == CM_Unknown &&
4949          "Decision should not be set yet.");
4950   auto *Group = getInterleavedAccessGroup(I);
4951   assert(Group && "Must have a group.");
4952 
  // If the instruction's allocated size doesn't equal its type size, it
4954   // requires padding and will be scalarized.
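  // For example, x86_fp80 holds an 80-bit value but is typically allocated in
  // 96- or 128-bit slots, leaving padding between consecutive in-memory
  // elements, so such a group cannot simply be loaded or stored as one wide
  // vector.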
4955   auto &DL = I->getModule()->getDataLayout();
4956   auto *ScalarTy = getMemInstValueType(I);
4957   if (hasIrregularType(ScalarTy, DL, VF))
4958     return false;
4959 
4960   // Check if masking is required.
4961   // A Group may need masking for one of two reasons: it resides in a block that
4962   // needs predication, or it was decided to use masking to deal with gaps.
4963   bool PredicatedAccessRequiresMasking =
4964       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4965   bool AccessWithGapsRequiresMasking =
4966       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4967   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4968     return true;
4969 
4970   // If masked interleaving is required, we expect that the user/target had
4971   // enabled it, because otherwise it either wouldn't have been created or
4972   // it should have been invalidated by the CostModel.
4973   assert(useMaskedInterleavedAccesses(TTI) &&
4974          "Masked interleave-groups for predicated accesses are not enabled.");
4975 
4976   auto *Ty = getMemInstValueType(I);
4977   const Align Alignment = getLoadStoreAlignment(I);
4978   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4979                           : TTI.isLegalMaskedStore(Ty, Alignment);
4980 }
4981 
4982 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4983     Instruction *I, ElementCount VF) {
4984   // Get and ensure we have a valid memory instruction.
4985   LoadInst *LI = dyn_cast<LoadInst>(I);
4986   StoreInst *SI = dyn_cast<StoreInst>(I);
4987   assert((LI || SI) && "Invalid memory instruction");
4988 
4989   auto *Ptr = getLoadStorePointerOperand(I);
4990 
  // First of all, in order to be widened the pointer should be consecutive.
4992   if (!Legal->isConsecutivePtr(Ptr))
4993     return false;
4994 
4995   // If the instruction is a store located in a predicated block, it will be
4996   // scalarized.
4997   if (isScalarWithPredication(I))
4998     return false;
4999 
  // If the instruction's allocated size doesn't equal its type size, it
5001   // requires padding and will be scalarized.
5002   auto &DL = I->getModule()->getDataLayout();
5003   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5004   if (hasIrregularType(ScalarTy, DL, VF))
5005     return false;
5006 
5007   return true;
5008 }
5009 
5010 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5011   // We should not collect Uniforms more than once per VF. Right now,
5012   // this function is called from collectUniformsAndScalars(), which
5013   // already does this check. Collecting Uniforms for VF=1 does not make any
5014   // sense.
5015 
5016   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5017          "This function should not be visited twice for the same VF");
5018 
  // Create the entry for this VF up front. Even if we find no uniform values,
  // we will not analyze this VF again: Uniforms.count(VF) will return 1.
5021   Uniforms[VF].clear();
5022 
5023   // We now know that the loop is vectorizable!
5024   // Collect instructions inside the loop that will remain uniform after
5025   // vectorization.
5026 
5027   // Global values, params and instructions outside of current loop are out of
5028   // scope.
5029   auto isOutOfScope = [&](Value *V) -> bool {
5030     Instruction *I = dyn_cast<Instruction>(V);
5031     return (!I || !TheLoop->contains(I));
5032   };
5033 
5034   SetVector<Instruction *> Worklist;
5035   BasicBlock *Latch = TheLoop->getLoopLatch();
5036 
5037   // Instructions that are scalar with predication must not be considered
5038   // uniform after vectorization, because that would create an erroneous
5039   // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if found important, see PR40816.
5041   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5042     if (isOutOfScope(I)) {
5043       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5044                         << *I << "\n");
5045       return;
5046     }
5047     if (isScalarWithPredication(I, VF)) {
5048       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5049                         << *I << "\n");
5050       return;
5051     }
5052     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5053     Worklist.insert(I);
5054   };
5055 
5056   // Start with the conditional branch. If the branch condition is an
5057   // instruction contained in the loop that is only used by the branch, it is
5058   // uniform.
5059   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5060   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5061     addToWorklistIfAllowed(Cmp);
5062 
5063   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5064   // are pointers that are treated like consecutive pointers during
5065   // vectorization. The pointer operands of interleaved accesses are an
5066   // example.
5067   SmallSetVector<Value *, 8> ConsecutiveLikePtrs;
5068 
5069   // Holds pointer operands of instructions that are possibly non-uniform.
5070   SmallPtrSet<Value *, 8> PossibleNonUniformPtrs;
5071 
5072   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5073     InstWidening WideningDecision = getWideningDecision(I, VF);
5074     assert(WideningDecision != CM_Unknown &&
5075            "Widening decision should be ready at this moment");
5076 
5077     // The address of a uniform mem op is itself uniform.  We exclude stores
5078     // here as there's an assumption in the current code that all uses of
5079     // uniform instructions are uniform and, as noted below, uniform stores are
5080     // still handled via replication (i.e. aren't uniform after vectorization).
5081     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5082       assert(WideningDecision == CM_Scalarize);
5083       return true;
5084     }
5085 
5086     return (WideningDecision == CM_Widen ||
5087             WideningDecision == CM_Widen_Reverse ||
5088             WideningDecision == CM_Interleave);
5089   };
5090 
5091 
5092   // Returns true if Ptr is the pointer operand of a memory access instruction
5093   // I, and I is known to not require scalarization.
5094   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5095     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5096   };
5097 
5098   // Iterate over the instructions in the loop, and collect all
5099   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5100   // that a consecutive-like pointer operand will be scalarized, we collect it
5101   // in PossibleNonUniformPtrs instead. We use two sets here because a single
5102   // getelementptr instruction can be used by both vectorized and scalarized
5103   // memory instructions. For example, if a loop loads and stores from the same
5104   // location, but the store is conditional, the store will be scalarized, and
5105   // the getelementptr won't remain uniform.
5106   for (auto *BB : TheLoop->blocks())
5107     for (auto &I : *BB) {
5108       // If there's no pointer operand, there's nothing to do.
5109       auto *Ptr = getLoadStorePointerOperand(&I);
5110       if (!Ptr)
5111         continue;
5112 
5113       // For now, avoid walking use lists in other functions.
5114       // TODO: Rewrite this algorithm from uses up.
5115       if (!isa<Instruction>(Ptr) && !isa<Argument>(Ptr))
5116         continue;
5117 
5118       // A uniform memory op is itself uniform.  We exclude stores here as we
5119       // haven't yet added dedicated logic in the CLONE path and rely on
5120       // REPLICATE + DSE for correctness.
5121       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5122         addToWorklistIfAllowed(&I);
5123 
5124       // True if all users of Ptr are memory accesses that have Ptr as their
5125       // pointer operand.  Since loops are assumed to be in LCSSA form, this
5126       // disallows uses outside the loop as well.
5127       auto UsersAreMemAccesses =
5128           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5129             return getLoadStorePointerOperand(U) == Ptr;
5130           });
5131 
5132       // Ensure the memory instruction will not be scalarized or used by
5133       // gather/scatter, making its pointer operand non-uniform. If the pointer
5134       // operand is used by any instruction other than a memory access, we
5135       // conservatively assume the pointer operand may be non-uniform.
5136       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5137         PossibleNonUniformPtrs.insert(Ptr);
5138 
5139       // If the memory instruction will be vectorized and its pointer operand
5140       // is consecutive-like, or interleaving - the pointer operand should
5141       // remain uniform.
5142       else
5143         ConsecutiveLikePtrs.insert(Ptr);
5144     }
5145 
5146   // Add to the Worklist all consecutive and consecutive-like pointers that
5147   // aren't also identified as possibly non-uniform.
5148   for (auto *V : ConsecutiveLikePtrs)
5149     if (!PossibleNonUniformPtrs.count(V))
5150       if (auto *I = dyn_cast<Instruction>(V))
5151         addToWorklistIfAllowed(I);
5152 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside the Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
5156   unsigned idx = 0;
5157   while (idx != Worklist.size()) {
5158     Instruction *I = Worklist[idx++];
5159 
5160     for (auto OV : I->operand_values()) {
5161       // isOutOfScope operands cannot be uniform instructions.
5162       if (isOutOfScope(OV))
5163         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5166       auto *OP = dyn_cast<PHINode>(OV);
5167       if (OP && Legal->isFirstOrderRecurrence(OP))
5168         continue;
5169       // If all the users of the operand are uniform, then add the
5170       // operand into the uniform worklist.
5171       auto *OI = cast<Instruction>(OV);
5172       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5173             auto *J = cast<Instruction>(U);
5174             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5175           }))
5176         addToWorklistIfAllowed(OI);
5177     }
5178   }
5179 
5180   // For an instruction to be added into Worklist above, all its users inside
5181   // the loop should also be in Worklist. However, this condition cannot be
5182   // true for phi nodes that form a cyclic dependence. We must process phi
5183   // nodes separately. An induction variable will remain uniform if all users
5184   // of the induction variable and induction variable update remain uniform.
5185   // The code below handles both pointer and non-pointer induction variables.
5186   for (auto &Induction : Legal->getInductionVars()) {
5187     auto *Ind = Induction.first;
5188     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5189 
5190     // Determine if all users of the induction variable are uniform after
5191     // vectorization.
5192     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5193       auto *I = cast<Instruction>(U);
5194       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5195              isVectorizedMemAccessUse(I, Ind);
5196     });
5197     if (!UniformInd)
5198       continue;
5199 
5200     // Determine if all users of the induction variable update instruction are
5201     // uniform after vectorization.
5202     auto UniformIndUpdate =
5203         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5204           auto *I = cast<Instruction>(U);
5205           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5206                  isVectorizedMemAccessUse(I, IndUpdate);
5207         });
5208     if (!UniformIndUpdate)
5209       continue;
5210 
5211     // The induction variable and its update instruction will remain uniform.
5212     addToWorklistIfAllowed(Ind);
5213     addToWorklistIfAllowed(IndUpdate);
5214   }
5215 
5216   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5217 }
5218 
5219 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5220   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5221 
5222   if (Legal->getRuntimePointerChecking()->Need) {
5223     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5224         "runtime pointer checks needed. Enable vectorization of this "
5225         "loop with '#pragma clang loop vectorize(enable)' when "
5226         "compiling with -Os/-Oz",
5227         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5228     return true;
5229   }
5230 
5231   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5232     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5233         "runtime SCEV checks needed. Enable vectorization of this "
5234         "loop with '#pragma clang loop vectorize(enable)' when "
5235         "compiling with -Os/-Oz",
5236         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5237     return true;
5238   }
5239 
5240   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5241   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5242     reportVectorizationFailure("Runtime stride check for small trip count",
5243         "runtime stride == 1 checks needed. Enable vectorization of "
5244         "this loop without such check by compiling with -Os/-Oz",
5245         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5246     return true;
5247   }
5248 
5249   return false;
5250 }
5251 
5252 Optional<ElementCount>
5253 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5254   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
5257     reportVectorizationFailure(
5258         "Not inserting runtime ptr check for divergent target",
5259         "runtime pointer checks needed. Not enabled for divergent target",
5260         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5261     return None;
5262   }
5263 
5264   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5265   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5266   if (TC == 1) {
5267     reportVectorizationFailure("Single iteration (non) loop",
5268         "loop trip count is one, irrelevant for vectorization",
5269         "SingleIterationLoop", ORE, TheLoop);
5270     return None;
5271   }
5272 
5273   switch (ScalarEpilogueStatus) {
5274   case CM_ScalarEpilogueAllowed:
5275     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5276   case CM_ScalarEpilogueNotNeededUsePredicate:
5277     LLVM_DEBUG(
5278         dbgs() << "LV: vector predicate hint/switch found.\n"
5279                << "LV: Not allowing scalar epilogue, creating predicated "
5280                << "vector loop.\n");
5281     break;
5282   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5283     // fallthrough as a special case of OptForSize
5284   case CM_ScalarEpilogueNotAllowedOptSize:
5285     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5286       LLVM_DEBUG(
5287           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5288     else
5289       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5290                         << "count.\n");
5291 
5292     // Bail if runtime checks are required, which are not good when optimising
5293     // for size.
5294     if (runtimeChecksRequired())
5295       return None;
5296     break;
5297   }
5298 
  // Now try tail folding.
5300 
5301   // Invalidate interleave groups that require an epilogue if we can't mask
5302   // the interleave-group.
5303   if (!useMaskedInterleavedAccesses(TTI)) {
5304     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5305            "No decisions should have been taken at this point");
5306     // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5308     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5309   }
5310 
5311   ElementCount MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5312   assert(!MaxVF.isScalable() &&
5313          "Scalable vectors do not yet support tail folding");
5314   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5315          "MaxVF must be a power of 2");
5316   unsigned MaxVFtimesIC =
5317       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5318   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5319     // Accept MaxVF if we do not have a tail.
5320     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5321     return MaxVF;
5322   }
5323 
5324   // If we don't know the precise trip count, or if the trip count that we
5325   // found modulo the vectorization factor is not zero, try to fold the tail
5326   // by masking.
5327   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5328   if (Legal->prepareToFoldTailByMasking()) {
5329     FoldTailByMasking = true;
5330     return MaxVF;
5331   }
5332 
5333   // If there was a tail-folding hint/switch, but we can't fold the tail by
5334   // masking, fallback to a vectorization with a scalar epilogue.
5335   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    if (PreferPredicateOverEpilogue ==
        PreferPredicateTy::PredicateOrDontVectorize) {
5337       LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5338       return None;
5339     }
5340     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5341                          "scalar epilogue instead.\n");
5342     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5343     return MaxVF;
5344   }
5345 
5346   if (TC == 0) {
5347     reportVectorizationFailure(
5348         "Unable to calculate the loop count due to complex control flow",
5349         "unable to calculate the loop count due to complex control flow",
5350         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5351     return None;
5352   }
5353 
5354   reportVectorizationFailure(
5355       "Cannot optimize for size and vectorize at the same time.",
5356       "cannot optimize for size and vectorize at the same time. "
5357       "Enable vectorization of this loop with '#pragma clang loop "
5358       "vectorize(enable)' when compiling with -Os/-Oz",
5359       "NoTailLoopWithOptForSize", ORE, TheLoop);
5360   return None;
5361 }
5362 
5363 ElementCount
5364 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5365   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5366   unsigned SmallestType, WidestType;
5367   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5368   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5369 
5370   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5373   // dependence distance).
5374   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5375 
5376   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5377 
5378   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
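  // For example, with a 256-bit widest (safe) register width and a 32-bit
  // widest type, MaxVectorSize becomes PowerOf2Floor(256 / 32) = 8 lanes.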
5380   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5381 
5382   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5383                     << " / " << WidestType << " bits.\n");
5384   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5385                     << WidestRegister << " bits.\n");
5386 
5387   assert(MaxVectorSize <= WidestRegister &&
5388          "Did not expect to pack so many elements"
5389          " into one vector!");
5390   if (MaxVectorSize == 0) {
5391     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5392     MaxVectorSize = 1;
5393     return ElementCount::getFixed(MaxVectorSize);
5394   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5395              isPowerOf2_32(ConstTripCount)) {
5396     // We need to clamp the VF to be the ConstTripCount. There is no point in
5397     // choosing a higher viable VF as done in the loop below.
5398     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5399                       << ConstTripCount << "\n");
5400     MaxVectorSize = ConstTripCount;
5401     return ElementCount::getFixed(MaxVectorSize);
5402   }
5403 
5404   unsigned MaxVF = MaxVectorSize;
5405   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5406       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5407     // Collect all viable vectorization factors larger than the default MaxVF
5408     // (i.e. MaxVectorSize).
5409     SmallVector<ElementCount, 8> VFs;
5410     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5411     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5412       VFs.push_back(ElementCount::getFixed(VS));
5413 
5414     // For each VF calculate its register usage.
5415     auto RUs = calculateRegisterUsage(VFs);
5416 
5417     // Select the largest VF which doesn't require more registers than existing
5418     // ones.
5419     for (int i = RUs.size() - 1; i >= 0; --i) {
5420       bool Selected = true;
5421       for (auto& pair : RUs[i].MaxLocalUsers) {
5422         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5423         if (pair.second > TargetNumRegisters)
5424           Selected = false;
5425       }
5426       if (Selected) {
5427         MaxVF = VFs[i].getKnownMinValue();
5428         break;
5429       }
5430     }
5431     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5432       if (MaxVF < MinVF) {
5433         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5434                           << ") with target's minimum: " << MinVF << '\n');
5435         MaxVF = MinVF;
5436       }
5437     }
5438   }
5439   return ElementCount::getFixed(MaxVF);
5440 }
5441 
5442 VectorizationFactor
5443 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5444   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5445 
5446   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5447   const float ScalarCost = Cost;
5448   unsigned Width = 1;
5449   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5450 
5451   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5452   if (ForceVectorization && MaxVF.isVector()) {
5453     // Ignore scalar width, because the user explicitly wants vectorization.
5454     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5455     // evaluation.
5456     Cost = std::numeric_limits<float>::max();
5457   }
5458 
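  // As an illustration of the comparison below: if the scalar loop costs 8
  // and the VF=4 loop body costs 20, the per-lane vector cost is 20 / 4 = 5,
  // which beats the scalar cost of 8, so VF=4 is preferred unless a wider VF
  // is cheaper still.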
5459   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
5463     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5464     float VectorCost = C.first / (float)i;
5465     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5466                       << " costs: " << (int)VectorCost << ".\n");
5467     if (!C.second && !ForceVectorization) {
5468       LLVM_DEBUG(
5469           dbgs() << "LV: Not considering vector loop of width " << i
5470                  << " because it will not generate any vector instructions.\n");
5471       continue;
5472     }
5473     if (VectorCost < Cost) {
5474       Cost = VectorCost;
5475       Width = i;
5476     }
5477   }
5478 
5479   if (!EnableCondStoresVectorization && NumPredStores) {
5480     reportVectorizationFailure("There are conditional stores.",
5481         "store that is conditionally executed prevents vectorization",
5482         "ConditionalStore", ORE, TheLoop);
5483     Width = 1;
5484     Cost = ScalarCost;
5485   }
5486 
5487   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5488              << "LV: Vectorization seems to be not beneficial, "
5489              << "but was forced by a user.\n");
5490   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5491   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5492                                 (unsigned)(Width * Cost)};
5493   return Factor;
5494 }
5495 
5496 std::pair<unsigned, unsigned>
5497 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5498   unsigned MinWidth = -1U;
5499   unsigned MaxWidth = 8;
5500   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5501 
5502   // For each block.
5503   for (BasicBlock *BB : TheLoop->blocks()) {
5504     // For each instruction in the loop.
5505     for (Instruction &I : BB->instructionsWithoutDebug()) {
5506       Type *T = I.getType();
5507 
5508       // Skip ignored values.
5509       if (ValuesToIgnore.count(&I))
5510         continue;
5511 
5512       // Only examine Loads, Stores and PHINodes.
5513       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5514         continue;
5515 
5516       // Examine PHI nodes that are reduction variables. Update the type to
5517       // account for the recurrence type.
5518       if (auto *PN = dyn_cast<PHINode>(&I)) {
5519         if (!Legal->isReductionVariable(PN))
5520           continue;
5521         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5522         T = RdxDesc.getRecurrenceType();
5523       }
5524 
5525       // Examine the stored values.
5526       if (auto *ST = dyn_cast<StoreInst>(&I))
5527         T = ST->getValueOperand()->getType();
5528 
5529       // Ignore loaded pointer types and stored pointer types that are not
5530       // vectorizable.
5531       //
5532       // FIXME: The check here attempts to predict whether a load or store will
5533       //        be vectorized. We only know this for certain after a VF has
5534       //        been selected. Here, we assume that if an access can be
5535       //        vectorized, it will be. We should also look at extending this
5536       //        optimization to non-pointer types.
5537       //
5538       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5539           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5540         continue;
5541 
5542       MinWidth = std::min(MinWidth,
5543                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5544       MaxWidth = std::max(MaxWidth,
5545                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5546     }
5547   }
5548 
5549   return {MinWidth, MaxWidth};
5550 }
5551 
5552 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5553                                                            unsigned LoopCost) {
5554   // -- The interleave heuristics --
5555   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5556   // There are many micro-architectural considerations that we can't predict
5557   // at this level. For example, frontend pressure (on decode or fetch) due to
5558   // code size, or the number and capabilities of the execution ports.
5559   //
5560   // We use the following heuristics to select the interleave count:
5561   // 1. If the code has reductions, then we interleave to break the cross
5562   // iteration dependency.
5563   // 2. If the loop is really small, then we interleave to reduce the loop
5564   // overhead.
5565   // 3. We don't interleave if we think that we will spill registers to memory
5566   // due to the increased register pressure.
5567 
5568   if (!isScalarEpilogueAllowed())
5569     return 1;
5570 
  // The maximum safe dependence distance was already used to limit the VF,
  // so do not widen the effective width further by interleaving.
5572   if (Legal->getMaxSafeDepDistBytes() != -1U)
5573     return 1;
5574 
5575   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5576   const bool HasReductions = !Legal->getReductionVars().empty();
5577   // Do not interleave loops with a relatively small known or estimated trip
5578   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because in that case interleaving can expose ILP and break the
  // cross-iteration dependences of the reductions.
5582   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5583       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5584     return 1;
5585 
5586   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so make sure each is at least one to
  // avoid dividing by zero.
5589   for (auto& pair : R.MaxLocalUsers) {
5590     pair.second = std::max(pair.second, 1U);
5591   }
5592 
5593   // We calculate the interleave count using the following formula.
5594   // Subtract the number of loop invariants from the number of available
5595   // registers. These registers are used by all of the interleaved instances.
5596   // Next, divide the remaining registers by the number of registers that is
5597   // required by the loop, in order to estimate how many parallel instances
5598   // fit without causing spills. All of this is rounded down if necessary to be
5599   // a power of two. We want power of two interleave count to simplify any
5600   // addressing operations or alignment considerations.
5601   // We also want power of two interleave counts to ensure that the induction
5602   // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when optimizing for size, in which case IC is set
  // to 1 above.
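  // As an illustration (ignoring the induction-variable adjustment below):
  // with 32 registers in a class, 2 of them used by loop-invariant values,
  // and at most 6 registers live inside the loop, the estimate is
  // PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4 interleaved instances.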
5604   unsigned IC = UINT_MAX;
5605 
5606   for (auto& pair : R.MaxLocalUsers) {
5607     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5608     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5609                       << " registers of "
5610                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5611     if (VF.isScalar()) {
5612       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5613         TargetNumRegisters = ForceTargetNumScalarRegs;
5614     } else {
5615       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5616         TargetNumRegisters = ForceTargetNumVectorRegs;
5617     }
5618     unsigned MaxLocalUsers = pair.second;
5619     unsigned LoopInvariantRegs = 0;
5620     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5621       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5622 
5623     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5624     // Don't count the induction variable as interleaved.
5625     if (EnableIndVarRegisterHeur) {
5626       TmpIC =
5627           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5628                         std::max(1U, (MaxLocalUsers - 1)));
5629     }
5630 
5631     IC = std::min(IC, TmpIC);
5632   }
5633 
5634   // Clamp the interleave ranges to reasonable counts.
5635   assert(!VF.isScalable() && "scalable vectors not yet supported.");
5636   unsigned MaxInterleaveCount =
5637       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5638 
5639   // Check if the user has overridden the max.
5640   if (VF.isScalar()) {
5641     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5642       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5643   } else {
5644     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5645       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5646   }
5647 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to at most the trip count divided by VF, while
  // making sure the result is at least 1.
5651   if (BestKnownTC) {
5652     MaxInterleaveCount =
5653         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5654     // Make sure MaxInterleaveCount is greater than 0.
5655     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5656   }
5657 
5658   assert(MaxInterleaveCount > 0 &&
5659          "Maximum interleave count must be greater than 0");
5660 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5663   if (IC > MaxInterleaveCount)
5664     IC = MaxInterleaveCount;
5665   else
5666     // Make sure IC is greater than 0.
5667     IC = std::max(1u, IC);
5668 
5669   assert(IC > 0 && "Interleave count must be greater than 0.");
5670 
5671   // If we did not calculate the cost for VF (because the user selected the VF)
5672   // then we calculate the cost of VF here.
5673   if (LoopCost == 0)
5674     LoopCost = expectedCost(VF).first;
5675 
5676   assert(LoopCost && "Non-zero loop cost expected");
5677 
5678   // Interleave if we vectorized this loop and there is a reduction that could
5679   // benefit from interleaving.
5680   if (VF.isVector() && HasReductions) {
5681     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5682     return IC;
5683   }
5684 
5685   // Note that if we've already vectorized the loop we will have done the
5686   // runtime check and so interleaving won't require further checks.
5687   bool InterleavingRequiresRuntimePointerCheck =
5688       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5689 
5690   // We want to interleave small loops in order to reduce the loop overhead and
5691   // potentially expose ILP opportunities.
5692   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5693                     << "LV: IC is " << IC << '\n'
5694                     << "LV: VF is " << VF.getKnownMinValue() << '\n');
5695   const bool AggressivelyInterleaveReductions =
5696       TTI.enableAggressiveInterleaving(HasReductions);
5697   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5698     // We assume that the cost overhead is 1 and we use the cost model
5699     // to estimate the cost of the loop and interleave until the cost of the
5700     // loop overhead is about 5% of the cost of the loop.
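    // As an illustration, if SmallLoopCost were 20 and LoopCost were 3, the
    // bound would be PowerOf2Floor(20 / 3) = PowerOf2Floor(6) = 4, i.e. at
    // most 4-way interleaving from this heuristic.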
5701     unsigned SmallIC =
5702         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5703 
5704     // Interleave until store/load ports (estimated by max interleave count) are
5705     // saturated.
5706     unsigned NumStores = Legal->getNumStores();
5707     unsigned NumLoads = Legal->getNumLoads();
5708     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5709     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5710 
5711     // If we have a scalar reduction (vector reductions are already dealt with
5712     // by this point), we can increase the critical path length if the loop
5713     // we're interleaving is inside another loop. Limit, by default to 2, so the
5714     // critical path only gets increased by one reduction operation.
5715     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5716       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5717       SmallIC = std::min(SmallIC, F);
5718       StoresIC = std::min(StoresIC, F);
5719       LoadsIC = std::min(LoadsIC, F);
5720     }
5721 
5722     if (EnableLoadStoreRuntimeInterleave &&
5723         std::max(StoresIC, LoadsIC) > SmallIC) {
5724       LLVM_DEBUG(
5725           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5726       return std::max(StoresIC, LoadsIC);
5727     }
5728 
5729     // If there are scalar reductions and TTI has enabled aggressive
5730     // interleaving for reductions, we will interleave to expose ILP.
5731     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5732         AggressivelyInterleaveReductions) {
5733       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5734       // Interleave no less than SmallIC but not as aggressive as the normal IC
5735       // to satisfy the rare situation when resources are too limited.
5736       return std::max(IC / 2, SmallIC);
5737     } else {
5738       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5739       return SmallIC;
5740     }
5741   }
5742 
5743   // Interleave if this is a large loop (small loops are already dealt with by
5744   // this point) that could benefit from interleaving.
5745   if (AggressivelyInterleaveReductions) {
5746     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5747     return IC;
5748   }
5749 
5750   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5751   return 1;
5752 }
5753 
5754 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5755 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5756   // This function calculates the register usage by measuring the highest number
5757   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5759   // assign a number to each instruction. We use RPO to ensure that defs are
5760   // met before their users. We assume that each instruction that has in-loop
5761   // users starts an interval. We record every time that an in-loop value is
5762   // used, so we have a list of the first and last occurrences of each
5763   // instruction. Next, we transpose this data structure into a multi map that
5764   // holds the list of intervals that *end* at a specific location. This multi
5765   // map allows us to perform a linear search. We scan the instructions linearly
5766   // and record each time that a new interval starts, by placing it in a set.
5767   // If we find this value in the multi-map then we remove it from the set.
5768   // The max register usage is the maximum size of the set.
5769   // We also search for instructions that are defined outside the loop, but are
5770   // used inside the loop. We need this number separately from the max-interval
5771   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
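  //
  // As a rough illustration, in a straight-line chain such as
  //   %a = load; %b = add %a, 1; store %b
  // at most one interval is open at any point of the scan, so the estimated
  // usage is a single register per class (scaled by the per-value register
  // usage for the given VF).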
5773   LoopBlocksDFS DFS(TheLoop);
5774   DFS.perform(LI);
5775 
5776   RegisterUsage RU;
5777 
5778   // Each 'key' in the map opens a new interval. The values
5779   // of the map are the index of the 'last seen' usage of the
5780   // instruction that is the key.
5781   using IntervalMap = DenseMap<Instruction *, unsigned>;
5782 
5783   // Maps instruction to its index.
5784   SmallVector<Instruction *, 64> IdxToInstr;
5785   // Marks the end of each interval.
5786   IntervalMap EndPoint;
5787   // Saves the list of instruction indices that are used in the loop.
5788   SmallPtrSet<Instruction *, 8> Ends;
5789   // Saves the list of values that are used in the loop but are
5790   // defined outside the loop, such as arguments and constants.
5791   SmallPtrSet<Value *, 8> LoopInvariants;
5792 
5793   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5794     for (Instruction &I : BB->instructionsWithoutDebug()) {
5795       IdxToInstr.push_back(&I);
5796 
5797       // Save the end location of each USE.
5798       for (Value *U : I.operands()) {
5799         auto *Instr = dyn_cast<Instruction>(U);
5800 
5801         // Ignore non-instruction values such as arguments, constants, etc.
5802         if (!Instr)
5803           continue;
5804 
5805         // If this instruction is outside the loop then record it and continue.
5806         if (!TheLoop->contains(Instr)) {
5807           LoopInvariants.insert(Instr);
5808           continue;
5809         }
5810 
5811         // Overwrite previous end points.
5812         EndPoint[Instr] = IdxToInstr.size();
5813         Ends.insert(Instr);
5814       }
5815     }
5816   }
5817 
5818   // Saves the list of intervals that end with the index in 'key'.
5819   using InstrList = SmallVector<Instruction *, 2>;
5820   DenseMap<unsigned, InstrList> TransposeEnds;
5821 
5822   // Transpose the EndPoints to a list of values that end at each index.
5823   for (auto &Interval : EndPoint)
5824     TransposeEnds[Interval.second].push_back(Interval.first);
5825 
5826   SmallPtrSet<Instruction *, 8> OpenIntervals;
5827   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5828   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5829 
5830   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5831 
5832   // A lambda that gets the register usage for the given type and VF.
5833   const auto &TTICapture = TTI;
5834   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
5835     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5836       return 0U;
5837     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5838   };
5839 
5840   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5841     Instruction *I = IdxToInstr[i];
5842 
5843     // Remove all of the instructions that end at this location.
5844     InstrList &List = TransposeEnds[i];
5845     for (Instruction *ToRemove : List)
5846       OpenIntervals.erase(ToRemove);
5847 
5848     // Ignore instructions that are never used within the loop.
5849     if (!Ends.count(I))
5850       continue;
5851 
5852     // Skip ignored values.
5853     if (ValuesToIgnore.count(I))
5854       continue;
5855 
5856     // For each VF find the maximum usage of registers.
5857     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5858       // Count the number of live intervals.
5859       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5860 
5861       if (VFs[j].isScalar()) {
5862         for (auto Inst : OpenIntervals) {
5863           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5864           if (RegUsage.find(ClassID) == RegUsage.end())
5865             RegUsage[ClassID] = 1;
5866           else
5867             RegUsage[ClassID] += 1;
5868         }
5869       } else {
5870         collectUniformsAndScalars(VFs[j]);
5871         for (auto Inst : OpenIntervals) {
5872           // Skip ignored values for VF > 1.
5873           if (VecValuesToIgnore.count(Inst))
5874             continue;
5875           if (isScalarAfterVectorization(Inst, VFs[j])) {
5876             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5877             if (RegUsage.find(ClassID) == RegUsage.end())
5878               RegUsage[ClassID] = 1;
5879             else
5880               RegUsage[ClassID] += 1;
5881           } else {
5882             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5883             if (RegUsage.find(ClassID) == RegUsage.end())
5884               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5885             else
5886               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5887           }
5888         }
5889       }
5890 
5891       for (auto& pair : RegUsage) {
5892         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5893           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5894         else
5895           MaxUsages[j][pair.first] = pair.second;
5896       }
5897     }
5898 
5899     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5900                       << OpenIntervals.size() << '\n');
5901 
5902     // Add the current instruction to the list of open intervals.
5903     OpenIntervals.insert(I);
5904   }
5905 
5906   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5907     SmallMapVector<unsigned, unsigned, 4> Invariant;
5908 
5909     for (auto Inst : LoopInvariants) {
5910       unsigned Usage =
5911           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5912       unsigned ClassID =
5913           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
5914       if (Invariant.find(ClassID) == Invariant.end())
5915         Invariant[ClassID] = Usage;
5916       else
5917         Invariant[ClassID] += Usage;
5918     }
5919 
5920     LLVM_DEBUG({
5921       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5922       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5923              << " item\n";
5924       for (const auto &pair : MaxUsages[i]) {
5925         dbgs() << "LV(REG): RegisterClass: "
5926                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5927                << " registers\n";
5928       }
5929       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5930              << " item\n";
5931       for (const auto &pair : Invariant) {
5932         dbgs() << "LV(REG): RegisterClass: "
5933                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5934                << " registers\n";
5935       }
5936     });
5937 
5938     RU.LoopInvariantRegs = Invariant;
5939     RU.MaxLocalUsers = MaxUsages[i];
5940     RUs[i] = RU;
5941   }
5942 
5943   return RUs;
5944 }
5945 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5947   // TODO: Cost model for emulated masked load/store is completely
5948   // broken. This hack guides the cost model to use an artificially
5949   // high enough value to practically disable vectorization with such
5950   // operations, except where previously deployed legality hack allowed
5951   // using very low cost values. This is to avoid regressions coming simply
5952   // from moving "masked load/store" check from legality to cost model.
5953   // Masked Load/Gather emulation was previously never allowed.
  // A limited amount of Masked Store/Scatter emulation was allowed.
5955   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5956   return isa<LoadInst>(I) ||
5957          (isa<StoreInst>(I) &&
5958           NumPredStores > NumberOfStoresToPredicate);
5959 }
5960 
5961 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5962   // If we aren't vectorizing the loop, or if we've already collected the
5963   // instructions to scalarize, there's nothing to do. Collection may already
5964   // have occurred if we have a user-selected VF and are now computing the
5965   // expected cost for interleaving.
5966   if (VF.isScalar() || VF.isZero() ||
5967       InstsToScalarize.find(VF) != InstsToScalarize.end())
5968     return;
5969 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5971   // not profitable to scalarize any instructions, the presence of VF in the
5972   // map will indicate that we've analyzed it already.
5973   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5974 
5975   // Find all the instructions that are scalar with predication in the loop and
5976   // determine if it would be better to not if-convert the blocks they are in.
5977   // If so, we also record the instructions to scalarize.
5978   for (BasicBlock *BB : TheLoop->blocks()) {
5979     if (!blockNeedsPredication(BB))
5980       continue;
5981     for (Instruction &I : *BB)
5982       if (isScalarWithPredication(&I)) {
5983         ScalarCostsTy ScalarCosts;
5984         // Do not apply discount logic if hacked cost is needed
5985         // for emulated masked memrefs.
5986         if (!useEmulatedMaskMemRefHack(&I) &&
5987             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5988           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5989         // Remember that BB will remain after vectorization.
5990         PredicatedBBsAfterVectorization.insert(BB);
5991       }
5992   }
5993 }
5994 
5995 int LoopVectorizationCostModel::computePredInstDiscount(
5996     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5997     ElementCount VF) {
5998   assert(!isUniformAfterVectorization(PredInst, VF) &&
5999          "Instruction marked uniform-after-vectorization will be predicated");
6000 
6001   // Initialize the discount to zero, meaning that the scalar version and the
6002   // vector version cost the same.
6003   int Discount = 0;
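  // A larger discount favors scalarization; the caller,
  // collectInstsToScalarize, scalarizes the chain rooted at PredInst when the
  // returned discount is non-negative.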
6004 
6005   // Holds instructions to analyze. The instructions we visit are mapped in
6006   // ScalarCosts. Those instructions are the ones that would be scalarized if
6007   // we find that the scalar version costs less.
6008   SmallVector<Instruction *, 8> Worklist;
6009 
6010   // Returns true if the given instruction can be scalarized.
6011   auto canBeScalarized = [&](Instruction *I) -> bool {
6012     // We only attempt to scalarize instructions forming a single-use chain
6013     // from the original predicated block that would otherwise be vectorized.
6014     // Although not strictly necessary, we give up on instructions we know will
6015     // already be scalar to avoid traversing chains that are unlikely to be
6016     // beneficial.
6017     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6018         isScalarAfterVectorization(I, VF))
6019       return false;
6020 
6021     // If the instruction is scalar with predication, it will be analyzed
6022     // separately. We ignore it within the context of PredInst.
6023     if (isScalarWithPredication(I))
6024       return false;
6025 
6026     // If any of the instruction's operands are uniform after vectorization,
6027     // the instruction cannot be scalarized. This prevents, for example, a
6028     // masked load from being scalarized.
6029     //
6030     // We assume we will only emit a value for lane zero of an instruction
6031     // marked uniform after vectorization, rather than VF identical values.
6032     // Thus, if we scalarize an instruction that uses a uniform, we would
6033     // create uses of values corresponding to the lanes we aren't emitting code
6034     // for. This behavior can be changed by allowing getScalarValue to clone
6035     // the lane zero values for uniforms rather than asserting.
6036     for (Use &U : I->operands())
6037       if (auto *J = dyn_cast<Instruction>(U.get()))
6038         if (isUniformAfterVectorization(J, VF))
6039           return false;
6040 
6041     // Otherwise, we can scalarize the instruction.
6042     return true;
6043   };
6044 
6045   // Compute the expected cost discount from scalarizing the entire expression
6046   // feeding the predicated instruction. We currently only consider expressions
6047   // that are single-use instruction chains.
6048   Worklist.push_back(PredInst);
6049   while (!Worklist.empty()) {
6050     Instruction *I = Worklist.pop_back_val();
6051 
6052     // If we've already analyzed the instruction, there's nothing to do.
6053     if (ScalarCosts.find(I) != ScalarCosts.end())
6054       continue;
6055 
6056     // Compute the cost of the vector instruction. Note that this cost already
6057     // includes the scalarization overhead of the predicated instruction.
6058     unsigned VectorCost = getInstructionCost(I, VF).first;
6059 
6060     // Compute the cost of the scalarized instruction. This cost is the cost of
6061     // the instruction as if it wasn't if-converted and instead remained in the
6062     // predicated block. We will scale this cost by block probability after
6063     // computing the scalarization overhead.
6064     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6065     unsigned ScalarCost =
6066         VF.getKnownMinValue() *
6067         getInstructionCost(I, ElementCount::getFixed(1)).first;
6068 
6069     // Compute the scalarization overhead of needed insertelement instructions
6070     // and phi nodes.
6071     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6072       ScalarCost += TTI.getScalarizationOverhead(
6073           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6074           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6075       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6076       ScalarCost +=
6077           VF.getKnownMinValue() *
6078           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6079     }
6080 
6081     // Compute the scalarization overhead of needed extractelement
6082     // instructions. For each of the instruction's operands, if the operand can
6083     // be scalarized, add it to the worklist; otherwise, account for the
6084     // overhead.
6085     for (Use &U : I->operands())
6086       if (auto *J = dyn_cast<Instruction>(U.get())) {
6087         assert(VectorType::isValidElementType(J->getType()) &&
6088                "Instruction has non-scalar type");
6089         if (canBeScalarized(J))
6090           Worklist.push_back(J);
6091         else if (needsExtract(J, VF)) {
6092           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6093           ScalarCost += TTI.getScalarizationOverhead(
6094               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6095               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6096         }
6097       }
6098 
6099     // Scale the total scalar cost by block probability.
6100     ScalarCost /= getReciprocalPredBlockProb();
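    // Illustrative example with assumed numbers (not from any target): with
    // VF = 4 and a per-lane scalar cost of 4, ScalarCost is 16 before scaling
    // (ignoring insert/extract overhead). Assuming the reciprocal block
    // probability above is 2, i.e. predicated blocks are modelled as running
    // half the time, the scaled ScalarCost becomes 8; a vector cost of 20
    // would then grow the discount below by 20 - 8 = 12.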
6101 
6102     // Compute the discount. A positive discount means the vector version of
6103     // the instruction costs more than the scalar one, so scalarizing helps.
6104     Discount += VectorCost - ScalarCost;
6105     ScalarCosts[I] = ScalarCost;
6106   }
6107 
6108   return Discount;
6109 }
6110 
6111 LoopVectorizationCostModel::VectorizationCostTy
6112 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6113   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6114   VectorizationCostTy Cost;
6115 
6116   // For each block.
6117   for (BasicBlock *BB : TheLoop->blocks()) {
6118     VectorizationCostTy BlockCost;
6119 
6120     // For each instruction in the old loop.
6121     for (Instruction &I : BB->instructionsWithoutDebug()) {
6122       // Skip ignored values.
6123       if (ValuesToIgnore.count(&I) ||
6124           (VF.isVector() && VecValuesToIgnore.count(&I)))
6125         continue;
6126 
6127       VectorizationCostTy C = getInstructionCost(&I, VF);
6128 
6129       // Check if we should override the cost.
6130       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6131         C.first = ForceTargetInstructionCost;
6132 
6133       BlockCost.first += C.first;
6134       BlockCost.second |= C.second;
6135       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6136                         << " for VF " << VF << " For instruction: " << I
6137                         << '\n');
6138     }
6139 
6140     // If we are vectorizing a predicated block, it will have been
6141     // if-converted. This means that the block's instructions (aside from
6142     // stores and instructions that may divide by zero) will now be
6143     // unconditionally executed. For the scalar case, we may not always execute
6144     // the predicated block. Thus, scale the block's cost by the probability of
6145     // executing it.
6146     if (VF.isScalar() && blockNeedsPredication(BB))
6147       BlockCost.first /= getReciprocalPredBlockProb();
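    // For example, assuming getReciprocalPredBlockProb() returns 2, a
    // predicated block whose scalar instructions sum to a cost of 10
    // contributes 5 to the loop's expected scalar cost.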
6148 
6149     Cost.first += BlockCost.first;
6150     Cost.second |= BlockCost.second;
6151   }
6152 
6153   return Cost;
6154 }
6155 
6156 /// Gets the address access SCEV after verifying that the access pattern is
6157 /// loop invariant except for the induction variable dependence.
6158 ///
6159 /// This SCEV can be sent to the Target in order to estimate the address
6160 /// calculation cost.
6161 static const SCEV *getAddressAccessSCEV(
6162               Value *Ptr,
6163               LoopVectorizationLegality *Legal,
6164               PredicatedScalarEvolution &PSE,
6165               const Loop *TheLoop) {
6166 
6167   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6168   if (!Gep)
6169     return nullptr;
6170 
6171   // We are looking for a gep with all loop invariant indices except for one
6172   // which should be an induction variable.
6173   auto SE = PSE.getSE();
6174   unsigned NumOperands = Gep->getNumOperands();
6175   for (unsigned i = 1; i < NumOperands; ++i) {
6176     Value *Opd = Gep->getOperand(i);
6177     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6178         !Legal->isInductionVariable(Opd))
6179       return nullptr;
6180   }
6181 
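  // For example (hypothetical IR), a pointer such as
  //   %gep = getelementptr inbounds i32, i32* %Base, i64 %iv
  // with induction variable %iv satisfies the check above, so its SCEV can be
  // handed to the target below.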
6182   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6183   return PSE.getSCEV(Ptr);
6184 }
6185 
6186 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6187   return Legal->hasStride(I->getOperand(0)) ||
6188          Legal->hasStride(I->getOperand(1));
6189 }
6190 
6191 unsigned
6192 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6193                                                         ElementCount VF) {
6194   assert(VF.isVector() &&
6195          "Scalarization cost of instruction implies vectorization.");
6196   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6197   Type *ValTy = getMemInstValueType(I);
6198   auto SE = PSE.getSE();
6199 
6200   unsigned AS = getLoadStoreAddressSpace(I);
6201   Value *Ptr = getLoadStorePointerOperand(I);
6202   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6203 
6204   // Figure out whether the access is strided and get the stride value
6205   // if it's known at compile time.
6206   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6207 
6208   // Get the cost of the scalar memory instruction and address computation.
6209   unsigned Cost =
6210       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6211 
6212   // Don't pass *I here, since it is scalar but will actually be part of a
6213   // vectorized loop where the user of it is a vectorized instruction.
6214   const Align Alignment = getLoadStoreAlignment(I);
6215   Cost += VF.getKnownMinValue() *
6216           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6217                               AS, TTI::TCK_RecipThroughput);
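  // As a rough sketch with assumed unit costs: at VF = 4, an address
  // computation cost of 1 and a scalar memory-op cost of 1 give
  // Cost = 4 * 1 + 4 * 1 = 8 at this point, before the insert/extract
  // overhead added below.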
6218 
6219   // Get the overhead of the extractelement and insertelement instructions
6220   // we might create due to scalarization.
6221   Cost += getScalarizationOverhead(I, VF);
6222 
6223   // If we have a predicated load or store, it may not be executed for each
6224   // lane. Scale the cost by the probability of executing the predicated
6225   // block.
6226   if (isPredicatedInst(I)) {
6227     Cost /= getReciprocalPredBlockProb();
6228 
6229     if (useEmulatedMaskMemRefHack(I))
6230       // Artificially setting to a high enough value to practically disable
6231       // vectorization with such operations.
6232       Cost = 3000000;
6233   }
6234 
6235   return Cost;
6236 }
6237 
6238 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6239                                                              ElementCount VF) {
6240   Type *ValTy = getMemInstValueType(I);
6241   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6242   Value *Ptr = getLoadStorePointerOperand(I);
6243   unsigned AS = getLoadStoreAddressSpace(I);
6244   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6245   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6246 
6247   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6248          "Stride should be 1 or -1 for consecutive memory access");
6249   const Align Alignment = getLoadStoreAlignment(I);
6250   unsigned Cost = 0;
6251   if (Legal->isMaskRequired(I))
6252     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6253                                       CostKind);
6254   else
6255     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6256                                 CostKind, I);
6257 
6258   bool Reverse = ConsecutiveStride < 0;
6259   if (Reverse)
6260     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6261   return Cost;
6262 }
6263 
6264 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6265                                                          ElementCount VF) {
6266   assert(Legal->isUniformMemOp(*I));
6267 
6268   Type *ValTy = getMemInstValueType(I);
6269   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6270   const Align Alignment = getLoadStoreAlignment(I);
6271   unsigned AS = getLoadStoreAddressSpace(I);
6272   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6273   if (isa<LoadInst>(I)) {
6274     return TTI.getAddressComputationCost(ValTy) +
6275            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6276                                CostKind) +
6277            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6278   }
6279   StoreInst *SI = cast<StoreInst>(I);
6280 
6281   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6282   return TTI.getAddressComputationCost(ValTy) +
6283          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6284                              CostKind) +
6285          (isLoopInvariantStoreValue
6286               ? 0
6287               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6288                                        VF.getKnownMinValue() - 1));
6289 }
6290 
6291 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6292                                                           ElementCount VF) {
6293   Type *ValTy = getMemInstValueType(I);
6294   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6295   const Align Alignment = getLoadStoreAlignment(I);
6296   const Value *Ptr = getLoadStorePointerOperand(I);
6297 
6298   return TTI.getAddressComputationCost(VectorTy) +
6299          TTI.getGatherScatterOpCost(
6300              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6301              TargetTransformInfo::TCK_RecipThroughput, I);
6302 }
6303 
6304 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6305                                                             ElementCount VF) {
6306   Type *ValTy = getMemInstValueType(I);
6307   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6308   unsigned AS = getLoadStoreAddressSpace(I);
6309 
6310   auto Group = getInterleavedAccessGroup(I);
6311   assert(Group && "Failed to get an interleaved access group.");
6312 
6313   unsigned InterleaveFactor = Group->getFactor();
6314   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6315   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
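  // E.g., a group with factor 2 over i32 elements at VF = 4 is modelled on a
  // wide <8 x i32> vector; the TTI hook below is expected to account for the
  // wide memory operation plus the shuffles that (de)interleave it.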
6316 
6317   // Holds the indices of existing members in an interleaved load group.
6318   // An interleaved store group doesn't need this as it doesn't allow gaps.
6319   SmallVector<unsigned, 4> Indices;
6320   if (isa<LoadInst>(I)) {
6321     for (unsigned i = 0; i < InterleaveFactor; i++)
6322       if (Group->getMember(i))
6323         Indices.push_back(i);
6324   }
6325 
6326   // Calculate the cost of the whole interleaved group.
6327   bool UseMaskForGaps =
6328       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6329   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6330       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6331       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6332 
6333   if (Group->isReverse()) {
6334     // TODO: Add support for reversed masked interleaved access.
6335     assert(!Legal->isMaskRequired(I) &&
6336            "Reverse masked interleaved access not supported.");
6337     Cost += Group->getNumMembers() *
6338             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6339   }
6340   return Cost;
6341 }
6342 
6343 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6344                                                               ElementCount VF) {
6345   // Calculate the scalar cost only. The vectorization cost should be ready
6346   // at this point.
6347   if (VF.isScalar()) {
6348     Type *ValTy = getMemInstValueType(I);
6349     const Align Alignment = getLoadStoreAlignment(I);
6350     unsigned AS = getLoadStoreAddressSpace(I);
6351 
6352     return TTI.getAddressComputationCost(ValTy) +
6353            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6354                                TTI::TCK_RecipThroughput, I);
6355   }
6356   return getWideningCost(I, VF);
6357 }
6358 
6359 LoopVectorizationCostModel::VectorizationCostTy
6360 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6361                                                ElementCount VF) {
6362   assert(!VF.isScalable() &&
6363          "the cost model is not yet implemented for scalable vectorization");
6364   // If we know that this instruction will remain uniform, check the cost of
6365   // the scalar version.
6366   if (isUniformAfterVectorization(I, VF))
6367     VF = ElementCount::getFixed(1);
6368 
6369   if (VF.isVector() && isProfitableToScalarize(I, VF))
6370     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6371 
6372   // Forced scalars do not have any scalarization overhead.
6373   auto ForcedScalar = ForcedScalars.find(VF);
6374   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6375     auto InstSet = ForcedScalar->second;
6376     if (InstSet.count(I))
6377       return VectorizationCostTy(
6378           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6379            VF.getKnownMinValue()),
6380           false);
6381   }
6382 
6383   Type *VectorTy;
6384   unsigned C = getInstructionCost(I, VF, VectorTy);
6385 
6386   bool TypeNotScalarized =
6387       VF.isVector() && VectorTy->isVectorTy() &&
6388       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6389   return VectorizationCostTy(C, TypeNotScalarized);
6390 }
6391 
6392 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6393                                                               ElementCount VF) {
6394 
6395   assert(!VF.isScalable() &&
6396          "cannot compute scalarization overhead for scalable vectorization");
6397   if (VF.isScalar())
6398     return 0;
6399 
6400   unsigned Cost = 0;
6401   Type *RetTy = ToVectorTy(I->getType(), VF);
6402   if (!RetTy->isVoidTy() &&
6403       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6404     Cost += TTI.getScalarizationOverhead(
6405         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6406         true, false);
6407 
6408   // Some targets keep addresses scalar.
6409   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6410     return Cost;
6411 
6412   // Some targets support efficient element stores.
6413   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6414     return Cost;
6415 
6416   // Collect operands to consider.
6417   CallInst *CI = dyn_cast<CallInst>(I);
6418   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6419 
6420   // Skip operands that do not require extraction/scalarization and do not incur
6421   // any overhead.
6422   return Cost + TTI.getOperandsScalarizationOverhead(
6423                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6424 }
6425 
6426 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6427   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6428   if (VF.isScalar())
6429     return;
6430   NumPredStores = 0;
6431   for (BasicBlock *BB : TheLoop->blocks()) {
6432     // For each instruction in the old loop.
6433     for (Instruction &I : *BB) {
6434       Value *Ptr = getLoadStorePointerOperand(&I);
6435       if (!Ptr)
6436         continue;
6437 
6438       // TODO: We should generate better code and update the cost model for
6439       // predicated uniform stores. Today they are treated as any other
6440       // predicated store (see added test cases in
6441       // invariant-store-vectorization.ll).
6442       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6443         NumPredStores++;
6444 
6445       if (Legal->isUniformMemOp(I)) {
6446         // TODO: Avoid replicating loads and stores instead of
6447         // relying on instcombine to remove them.
6448         // Load: Scalar load + broadcast
6449         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6450         unsigned Cost = getUniformMemOpCost(&I, VF);
6451         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6452         continue;
6453       }
6454 
6455       // We assume that widening is the best solution when possible.
6456       if (memoryInstructionCanBeWidened(&I, VF)) {
6457         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6458         int ConsecutiveStride =
6459                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6460         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6461                "Expected consecutive stride.");
6462         InstWidening Decision =
6463             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6464         setWideningDecision(&I, VF, Decision, Cost);
6465         continue;
6466       }
6467 
6468       // Choose between Interleaving, Gather/Scatter or Scalarization.
6469       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6470       unsigned NumAccesses = 1;
6471       if (isAccessInterleaved(&I)) {
6472         auto Group = getInterleavedAccessGroup(&I);
6473         assert(Group && "Failed to get an interleaved access group.");
6474 
6475         // Make one decision for the whole group.
6476         if (getWideningDecision(&I, VF) != CM_Unknown)
6477           continue;
6478 
6479         NumAccesses = Group->getNumMembers();
6480         if (interleavedAccessCanBeWidened(&I, VF))
6481           InterleaveCost = getInterleaveGroupCost(&I, VF);
6482       }
6483 
6484       unsigned GatherScatterCost =
6485           isLegalGatherOrScatter(&I)
6486               ? getGatherScatterCost(&I, VF) * NumAccesses
6487               : std::numeric_limits<unsigned>::max();
6488 
6489       unsigned ScalarizationCost =
6490           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6491 
6492       // Choose better solution for the current VF,
6493       // write down this decision and use it during vectorization.
6494       unsigned Cost;
6495       InstWidening Decision;
6496       if (InterleaveCost <= GatherScatterCost &&
6497           InterleaveCost < ScalarizationCost) {
6498         Decision = CM_Interleave;
6499         Cost = InterleaveCost;
6500       } else if (GatherScatterCost < ScalarizationCost) {
6501         Decision = CM_GatherScatter;
6502         Cost = GatherScatterCost;
6503       } else {
6504         Decision = CM_Scalarize;
6505         Cost = ScalarizationCost;
6506       }
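      // For instance, with assumed costs InterleaveCost = 8,
      // GatherScatterCost = 12 and ScalarizationCost = 20, the chosen
      // decision above is CM_Interleave with Cost = 8.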
6507       // If the instruction belongs to an interleave group, the whole group
6508       // receives the same decision. The whole group also receives the cost,
6509       // but the cost will actually be assigned to one instruction.
6510       if (auto Group = getInterleavedAccessGroup(&I))
6511         setWideningDecision(Group, VF, Decision, Cost);
6512       else
6513         setWideningDecision(&I, VF, Decision, Cost);
6514     }
6515   }
6516 
6517   // Make sure that any load of an address and any other address computation
6518   // remains scalar unless there is gather/scatter support. This avoids
6519   // inevitable extracts into address registers, and also has the benefit of
6520   // activating LSR more, since that pass can't optimize vectorized
6521   // addresses.
6522   if (TTI.prefersVectorizedAddressing())
6523     return;
6524 
6525   // Start with all scalar pointer uses.
6526   SmallPtrSet<Instruction *, 8> AddrDefs;
6527   for (BasicBlock *BB : TheLoop->blocks())
6528     for (Instruction &I : *BB) {
6529       Instruction *PtrDef =
6530         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6531       if (PtrDef && TheLoop->contains(PtrDef) &&
6532           getWideningDecision(&I, VF) != CM_GatherScatter)
6533         AddrDefs.insert(PtrDef);
6534     }
6535 
6536   // Add all instructions used to generate the addresses.
6537   SmallVector<Instruction *, 4> Worklist;
6538   for (auto *I : AddrDefs)
6539     Worklist.push_back(I);
6540   while (!Worklist.empty()) {
6541     Instruction *I = Worklist.pop_back_val();
6542     for (auto &Op : I->operands())
6543       if (auto *InstOp = dyn_cast<Instruction>(Op))
6544         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6545             AddrDefs.insert(InstOp).second)
6546           Worklist.push_back(InstOp);
6547   }
6548 
6549   for (auto *I : AddrDefs) {
6550     if (isa<LoadInst>(I)) {
6551       // Setting the desired widening decision should ideally be handled by
6552       // the cost functions, but since this involves finding out whether the
6553       // loaded register is involved in an address computation, it is instead
6554       // changed here once we know this is the case.
6555       InstWidening Decision = getWideningDecision(I, VF);
6556       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6557         // Scalarize a widened load of address.
6558         setWideningDecision(
6559             I, VF, CM_Scalarize,
6560             (VF.getKnownMinValue() *
6561              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6562       else if (auto Group = getInterleavedAccessGroup(I)) {
6563         // Scalarize an interleave group of address loads.
6564         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6565           if (Instruction *Member = Group->getMember(I))
6566             setWideningDecision(
6567                 Member, VF, CM_Scalarize,
6568                 (VF.getKnownMinValue() *
6569                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6570         }
6571       }
6572     } else
6573       // Make sure I gets scalarized and a cost estimate without
6574       // scalarization overhead.
6575       ForcedScalars[VF].insert(I);
6576   }
6577 }
6578 
6579 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6580                                                         ElementCount VF,
6581                                                         Type *&VectorTy) {
6582   Type *RetTy = I->getType();
6583   if (canTruncateToMinimalBitwidth(I, VF))
6584     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6585   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6586   auto SE = PSE.getSE();
6587   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6588 
6589   // TODO: We need to estimate the cost of intrinsic calls.
6590   switch (I->getOpcode()) {
6591   case Instruction::GetElementPtr:
6592     // We mark this instruction as zero-cost because the cost of GEPs in
6593     // vectorized code depends on whether the corresponding memory instruction
6594     // is scalarized or not. Therefore, we handle GEPs with the memory
6595     // instruction cost.
6596     return 0;
6597   case Instruction::Br: {
6598     // In cases of scalarized and predicated instructions, there will be VF
6599     // predicated blocks in the vectorized loop. Each branch around these
6600     // blocks also requires an extract of its vector compare i1 element.
6601     bool ScalarPredicatedBB = false;
6602     BranchInst *BI = cast<BranchInst>(I);
6603     if (VF.isVector() && BI->isConditional() &&
6604         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6605          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6606       ScalarPredicatedBB = true;
6607 
6608     if (ScalarPredicatedBB) {
6609       // Return cost for branches around scalarized and predicated blocks.
6610       assert(!VF.isScalable() && "scalable vectors not yet supported.");
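      // E.g., at VF = 4 the cost below covers extracting the four i1
      // elements of the vector compare plus four scalar branches.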
6611       auto *Vec_i1Ty =
6612           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6613       return (TTI.getScalarizationOverhead(
6614                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6615                   false, true) +
6616               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6617                VF.getKnownMinValue()));
6618     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6619       // The back-edge branch will remain, as will all scalar branches.
6620       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6621     else
6622       // This branch will be eliminated by if-conversion.
6623       return 0;
6624     // Note: We currently assume zero cost for an unconditional branch inside
6625     // a predicated block since it will become a fall-through, although we
6626     // may decide in the future to call TTI for all branches.
6627   }
6628   case Instruction::PHI: {
6629     auto *Phi = cast<PHINode>(I);
6630 
6631     // First-order recurrences are replaced by vector shuffles inside the loop.
6632     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6633     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6634       return TTI.getShuffleCost(
6635           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6636           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6637 
6638     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6639     // converted into select instructions. We require N - 1 selects per phi
6640     // node, where N is the number of incoming values.
6641     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6642       return (Phi->getNumIncomingValues() - 1) *
6643              TTI.getCmpSelInstrCost(
6644                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6645                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6646                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6647 
6648     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6649   }
6650   case Instruction::UDiv:
6651   case Instruction::SDiv:
6652   case Instruction::URem:
6653   case Instruction::SRem:
6654     // If we have a predicated instruction, it may not be executed for each
6655     // vector lane. Get the scalarization cost and scale this amount by the
6656     // probability of executing the predicated block. If the instruction is not
6657     // predicated, we fall through to the next case.
6658     if (VF.isVector() && isScalarWithPredication(I)) {
6659       unsigned Cost = 0;
6660 
6661       // These instructions have a non-void type, so account for the phi nodes
6662       // that we will create. This cost is likely to be zero. The phi node
6663       // cost, if any, should be scaled by the block probability because it
6664       // models a copy at the end of each predicated block.
6665       Cost += VF.getKnownMinValue() *
6666               TTI.getCFInstrCost(Instruction::PHI, CostKind);
6667 
6668       // The cost of the non-predicated instruction.
6669       Cost += VF.getKnownMinValue() *
6670               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6671 
6672       // The cost of insertelement and extractelement instructions needed for
6673       // scalarization.
6674       Cost += getScalarizationOverhead(I, VF);
6675 
6676       // Scale the cost by the probability of executing the predicated blocks.
6677       // This assumes the predicated block for each vector lane is equally
6678       // likely.
6679       return Cost / getReciprocalPredBlockProb();
6680     }
6681     LLVM_FALLTHROUGH;
6682   case Instruction::Add:
6683   case Instruction::FAdd:
6684   case Instruction::Sub:
6685   case Instruction::FSub:
6686   case Instruction::Mul:
6687   case Instruction::FMul:
6688   case Instruction::FDiv:
6689   case Instruction::FRem:
6690   case Instruction::Shl:
6691   case Instruction::LShr:
6692   case Instruction::AShr:
6693   case Instruction::And:
6694   case Instruction::Or:
6695   case Instruction::Xor: {
6696     // Since we will replace the stride by 1, the multiplication should go away.
6697     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6698       return 0;
6699     // Certain instructions can be cheaper to vectorize if they have a constant
6700     // second vector operand. One example of this are shifts on x86.
6701     Value *Op2 = I->getOperand(1);
6702     TargetTransformInfo::OperandValueProperties Op2VP;
6703     TargetTransformInfo::OperandValueKind Op2VK =
6704         TTI.getOperandInfo(Op2, Op2VP);
6705     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6706       Op2VK = TargetTransformInfo::OK_UniformValue;
6707 
6708     SmallVector<const Value *, 4> Operands(I->operand_values());
6709     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6710     return N * TTI.getArithmeticInstrCost(
6711                    I->getOpcode(), VectorTy, CostKind,
6712                    TargetTransformInfo::OK_AnyValue,
6713                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6714   }
6715   case Instruction::FNeg: {
6716     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6717     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6718     return N * TTI.getArithmeticInstrCost(
6719                    I->getOpcode(), VectorTy, CostKind,
6720                    TargetTransformInfo::OK_AnyValue,
6721                    TargetTransformInfo::OK_AnyValue,
6722                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6723                    I->getOperand(0), I);
6724   }
6725   case Instruction::Select: {
6726     SelectInst *SI = cast<SelectInst>(I);
6727     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6728     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6729     Type *CondTy = SI->getCondition()->getType();
6730     if (!ScalarCond) {
6731       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6732       CondTy = VectorType::get(CondTy, VF);
6733     }
6734     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6735                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
6736   }
6737   case Instruction::ICmp:
6738   case Instruction::FCmp: {
6739     Type *ValTy = I->getOperand(0)->getType();
6740     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6741     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6742       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6743     VectorTy = ToVectorTy(ValTy, VF);
6744     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6745                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
6746   }
6747   case Instruction::Store:
6748   case Instruction::Load: {
6749     ElementCount Width = VF;
6750     if (Width.isVector()) {
6751       InstWidening Decision = getWideningDecision(I, Width);
6752       assert(Decision != CM_Unknown &&
6753              "CM decision should be taken at this point");
6754       if (Decision == CM_Scalarize)
6755         Width = ElementCount::getFixed(1);
6756     }
6757     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6758     return getMemoryInstructionCost(I, VF);
6759   }
6760   case Instruction::ZExt:
6761   case Instruction::SExt:
6762   case Instruction::FPToUI:
6763   case Instruction::FPToSI:
6764   case Instruction::FPExt:
6765   case Instruction::PtrToInt:
6766   case Instruction::IntToPtr:
6767   case Instruction::SIToFP:
6768   case Instruction::UIToFP:
6769   case Instruction::Trunc:
6770   case Instruction::FPTrunc:
6771   case Instruction::BitCast: {
6772     // Computes the CastContextHint from a Load/Store instruction.
6773     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6774       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6775              "Expected a load or a store!");
6776 
6777       if (VF.isScalar() || !TheLoop->contains(I))
6778         return TTI::CastContextHint::Normal;
6779 
6780       switch (getWideningDecision(I, VF)) {
6781       case LoopVectorizationCostModel::CM_GatherScatter:
6782         return TTI::CastContextHint::GatherScatter;
6783       case LoopVectorizationCostModel::CM_Interleave:
6784         return TTI::CastContextHint::Interleave;
6785       case LoopVectorizationCostModel::CM_Scalarize:
6786       case LoopVectorizationCostModel::CM_Widen:
6787         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6788                                         : TTI::CastContextHint::Normal;
6789       case LoopVectorizationCostModel::CM_Widen_Reverse:
6790         return TTI::CastContextHint::Reversed;
6791       case LoopVectorizationCostModel::CM_Unknown:
6792         llvm_unreachable("Instr did not go through cost modelling?");
6793       }
6794 
6795       llvm_unreachable("Unhandled case!");
6796     };
6797 
6798     unsigned Opcode = I->getOpcode();
6799     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6800     // For Trunc, the context is the only user, which must be a StoreInst.
6801     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6802       if (I->hasOneUse())
6803         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6804           CCH = ComputeCCH(Store);
6805     }
6806     // For Z/Sext, the context is the operand, which must be a LoadInst.
6807     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6808              Opcode == Instruction::FPExt) {
6809       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6810         CCH = ComputeCCH(Load);
6811     }
6812 
6813     // We optimize the truncation of induction variables having constant
6814     // integer steps. The cost of these truncations is the same as the scalar
6815     // operation.
6816     if (isOptimizableIVTruncate(I, VF)) {
6817       auto *Trunc = cast<TruncInst>(I);
6818       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6819                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6820     }
6821 
6822     Type *SrcScalarTy = I->getOperand(0)->getType();
6823     Type *SrcVecTy =
6824         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6825     if (canTruncateToMinimalBitwidth(I, VF)) {
6826       // This cast is going to be shrunk. This may remove the cast or it might
6827       // turn it into a slightly different cast. For example, if MinBW == 16,
6828       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6829       //
6830       // Calculate the modified src and dest types.
6831       Type *MinVecTy = VectorTy;
6832       if (Opcode == Instruction::Trunc) {
6833         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6834         VectorTy =
6835             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6836       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6837         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6838         VectorTy =
6839             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6840       }
6841     }
6842 
6843     assert(!VF.isScalable() && "VF is assumed to be non scalable");
6844     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6845     return N *
6846            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6847   }
6848   case Instruction::Call: {
6849     bool NeedToScalarize;
6850     CallInst *CI = cast<CallInst>(I);
6851     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6852     if (getVectorIntrinsicIDForCall(CI, TLI))
6853       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6854     return CallCost;
6855   }
6856   default:
6857     // The cost of executing VF copies of the scalar instruction. This opcode
6858     // is unknown. Assume that it is the same as 'mul'.
6859     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
6860                                        Instruction::Mul, VectorTy, CostKind) +
6861            getScalarizationOverhead(I, VF);
6862   } // end of switch.
6863 }
6864 
6865 char LoopVectorize::ID = 0;
6866 
6867 static const char lv_name[] = "Loop Vectorization";
6868 
6869 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6870 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6871 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6872 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6873 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6874 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6875 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6876 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6877 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6878 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6879 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6880 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6881 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6882 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6883 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6884 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6885 
6886 namespace llvm {
6887 
6888 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6889 
6890 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6891                               bool VectorizeOnlyWhenForced) {
6892   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6893 }
6894 
6895 } // end namespace llvm
6896 
6897 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6898   // Check if the pointer operand of a load or store instruction is
6899   // consecutive.
6900   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6901     return Legal->isConsecutivePtr(Ptr);
6902   return false;
6903 }
6904 
6905 void LoopVectorizationCostModel::collectValuesToIgnore() {
6906   // Ignore ephemeral values.
6907   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6908 
6909   // Ignore type-promoting instructions we identified during reduction
6910   // detection.
6911   for (auto &Reduction : Legal->getReductionVars()) {
6912     RecurrenceDescriptor &RedDes = Reduction.second;
6913     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6914     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6915   }
6916   // Ignore type-casting instructions we identified during induction
6917   // detection.
6918   for (auto &Induction : Legal->getInductionVars()) {
6919     InductionDescriptor &IndDes = Induction.second;
6920     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6921     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6922   }
6923 }
6924 
6925 void LoopVectorizationCostModel::collectInLoopReductions() {
6926   for (auto &Reduction : Legal->getReductionVars()) {
6927     PHINode *Phi = Reduction.first;
6928     RecurrenceDescriptor &RdxDesc = Reduction.second;
6929 
6930     // We don't collect reductions that are type promoted (yet).
6931     if (RdxDesc.getRecurrenceType() != Phi->getType())
6932       continue;
6933 
6934     // If the target would prefer this reduction to happen "in-loop", then we
6935     // want to record it as such.
6936     unsigned Opcode = RdxDesc.getRecurrenceBinOp();
6937     if (!PreferInLoopReductions &&
6938         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6939                                    TargetTransformInfo::ReductionFlags()))
6940       continue;
6941 
6942     // Check that we can correctly put the reductions into the loop, by
6943     // finding the chain of operations that leads from the phi to the loop
6944     // exit value.
6945     SmallVector<Instruction *, 4> ReductionOperations =
6946         RdxDesc.getReductionOpChain(Phi, TheLoop);
6947     bool InLoop = !ReductionOperations.empty();
6948     if (InLoop)
6949       InLoopReductionChains[Phi] = ReductionOperations;
6950     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6951                       << " reduction for phi: " << *Phi << "\n");
6952   }
6953 }
6954 
6955 // TODO: we could return a pair of values that specify the max VF and
6956 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6957 // `buildVPlans(VF, VF)`. We cannot do this yet because VPlan currently
6958 // doesn't have a cost model that can choose which plan to execute if
6959 // more than one is generated.
6960 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6961                                  LoopVectorizationCostModel &CM) {
6962   unsigned WidestType;
6963   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
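  // E.g., a 256-bit widest vector register and a widest in-loop type of
  // 32 bits yield a VF of 256 / 32 = 8.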
6964   return WidestVectorRegBits / WidestType;
6965 }
6966 
6967 VectorizationFactor
6968 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6969   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
6970   ElementCount VF = UserVF;
6971   // Outer loop handling: outer loops may require CFG and instruction-level
6972   // transformations before even evaluating whether vectorization is profitable.
6973   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6974   // the vectorization pipeline.
6975   if (!OrigLoop->isInnermost()) {
6976     // If the user doesn't provide a vectorization factor, determine a
6977     // reasonable one.
6978     if (UserVF.isZero()) {
6979       VF = ElementCount::getFixed(
6980           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
6981       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6982 
6983       // Make sure we have a VF > 1 for stress testing.
6984       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6985         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6986                           << "overriding computed VF.\n");
6987         VF = ElementCount::getFixed(4);
6988       }
6989     }
6990     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6991     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6992            "VF needs to be a power of two");
6993     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6994                       << "VF " << VF << " to build VPlans.\n");
6995     buildVPlans(VF, VF);
6996 
6997     // For VPlan build stress testing, we bail out after VPlan construction.
6998     if (VPlanBuildStressTest)
6999       return VectorizationFactor::Disabled();
7000 
7001     return {VF, 0 /*Cost*/};
7002   }
7003 
7004   LLVM_DEBUG(
7005       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7006                 "VPlan-native path.\n");
7007   return VectorizationFactor::Disabled();
7008 }
7009 
7010 Optional<VectorizationFactor>
7011 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7012   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
7013   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7014   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7015   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
7016     return None;
7017 
7018   // Invalidate interleave groups if all blocks of the loop will be predicated.
7019   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7020       !useMaskedInterleavedAccesses(*TTI)) {
7021     LLVM_DEBUG(
7022         dbgs()
7023         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7024            "which requires masked-interleaved support.\n");
7025     if (CM.InterleaveInfo.invalidateGroups())
7026       // Invalidating interleave groups also requires invalidating all decisions
7027       // based on them, which includes widening decisions and uniform and scalar
7028       // values.
7029       CM.invalidateCostModelingDecisions();
7030   }
7031 
7032   if (!UserVF.isZero()) {
7033     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7034     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7035            "VF needs to be a power of two");
7036     // Collect the instructions (and their associated costs) that will be more
7037     // profitable to scalarize.
7038     CM.selectUserVectorizationFactor(UserVF);
7039     CM.collectInLoopReductions();
7040     buildVPlansWithVPRecipes(UserVF, UserVF);
7041     LLVM_DEBUG(printPlans(dbgs()));
7042     return {{UserVF, 0}};
7043   }
7044 
7045   ElementCount MaxVF = MaybeMaxVF.getValue();
7046   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7047 
7048   for (ElementCount VF = ElementCount::getFixed(1);
7049        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7050     // Collect Uniform and Scalar instructions after vectorization with VF.
7051     CM.collectUniformsAndScalars(VF);
7052 
7053     // Collect the instructions (and their associated costs) that will be more
7054     // profitable to scalarize.
7055     if (VF.isVector())
7056       CM.collectInstsToScalarize(VF);
7057   }
7058 
7059   CM.collectInLoopReductions();
7060 
7061   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7062   LLVM_DEBUG(printPlans(dbgs()));
7063   if (MaxVF.isScalar())
7064     return VectorizationFactor::Disabled();
7065 
7066   // Select the optimal vectorization factor.
7067   return CM.selectVectorizationFactor(MaxVF);
7068 }
7069 
7070 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7071   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7072                     << '\n');
7073   BestVF = VF;
7074   BestUF = UF;
7075 
7076   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7077     return !Plan->hasVF(VF);
7078   });
7079   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7080 }
7081 
7082 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7083                                            DominatorTree *DT) {
7084   // Perform the actual loop transformation.
7085 
7086   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7087   VPCallbackILV CallbackILV(ILV);
7088 
7089   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7090 
7091   VPTransformState State{*BestVF, BestUF,      LI,
7092                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7093                          &ILV,    CallbackILV};
7094   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7095   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7096   State.CanonicalIV = ILV.Induction;
7097 
7098   //===------------------------------------------------===//
7099   //
7100   // Notice: any optimization or new instruction that goes
7101   // into the code below should also be implemented in
7102   // the cost model.
7103   //
7104   //===------------------------------------------------===//
7105 
7106   // 2. Copy and widen instructions from the old loop into the new loop.
7107   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7108   VPlans.front()->execute(&State);
7109 
7110   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7111   //    predication, updating analyses.
7112   ILV.fixVectorizedLoop();
7113 }
7114 
7115 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7116     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7117   BasicBlock *Latch = OrigLoop->getLoopLatch();
7118 
7119   // We create new control-flow for the vectorized loop, so the original
7120   // condition will be dead after vectorization if it's only used by the
7121   // branch.
7122   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7123   if (Cmp && Cmp->hasOneUse()) {
7124     DeadInstructions.insert(Cmp);
7125 
7126     // An operand of the icmp is often a dead trunc, used by IndUpdate.
7127     for (Value *Op : Cmp->operands()) {
7128       if (isa<TruncInst>(Op) && Op->hasOneUse())
7129         DeadInstructions.insert(cast<Instruction>(Op));
7130     }
7131   }
7132 
7133   // We create new "steps" for induction variable updates to which the original
7134   // induction variables map. An original update instruction will be dead if
7135   // all its users except the induction variable are dead.
7136   for (auto &Induction : Legal->getInductionVars()) {
7137     PHINode *Ind = Induction.first;
7138     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7139 
7140     // If the tail is to be folded by masking, the primary induction variable,
7141     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7142     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7143       continue;
7144 
7145     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7146           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7147         }))
7148       DeadInstructions.insert(IndUpdate);
7149 
7150     // We record as "Dead" also the type-casting instructions we had identified
7151     // during induction analysis. We don't need any handling for them in the
7152     // vectorized loop because we have proven that, under a proper runtime
7153     // test guarding the vectorized loop, the value of the phi and the casted
7154     // value of the phi are the same. The last instruction in this casting chain
7155     // will get its scalar/vector/widened def from the scalar/vector/widened def
7156     // of the respective phi node. Any other casts in the induction def-use chain
7157     // have no other uses outside the phi update chain, and will be ignored.
7158     InductionDescriptor &IndDes = Induction.second;
7159     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7160     DeadInstructions.insert(Casts.begin(), Casts.end());
7161   }
7162 }
7163 
7164 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7165 
7166 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7167 
7168 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7169                                         Instruction::BinaryOps BinOp) {
7170   // When unrolling and the VF is 1, we only need to add a simple scalar.
7171   Type *Ty = Val->getType();
7172   assert(!Ty->isVectorTy() && "Val must be a scalar");
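  // E.g., for an integer induction with Step = 1 and StartIdx = 2, this
  // returns Val + 2 * 1 = Val + 2, i.e. the induction value advanced by two
  // steps.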
7173 
7174   if (Ty->isFloatingPointTy()) {
7175     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7176 
7177     // Floating point operations had to be 'fast' to enable the unrolling.
7178     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7179     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7180   }
7181   Constant *C = ConstantInt::get(Ty, StartIdx);
7182   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7183 }
7184 
7185 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7186   SmallVector<Metadata *, 4> MDs;
7187   // Reserve first location for self reference to the LoopID metadata node.
7188   MDs.push_back(nullptr);
7189   bool IsUnrollMetadata = false;
7190   MDNode *LoopID = L->getLoopID();
7191   if (LoopID) {
7192     // First find existing loop unrolling disable metadata.
7193     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7194       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7195       if (MD) {
7196         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7197         IsUnrollMetadata =
7198             S && S->getString().startswith("llvm.loop.unroll.disable");
7199       }
7200       MDs.push_back(LoopID->getOperand(i));
7201     }
7202   }
7203 
7204   if (!IsUnrollMetadata) {
7205     // Add runtime unroll disable metadata.
7206     LLVMContext &Context = L->getHeader()->getContext();
7207     SmallVector<Metadata *, 1> DisableOperands;
7208     DisableOperands.push_back(
7209         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7210     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
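    // At this point DisableNode is roughly the node
    // !{!"llvm.loop.unroll.runtime.disable"}, which gets appended to the
    // loop ID below.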
7211     MDs.push_back(DisableNode);
7212     MDNode *NewLoopID = MDNode::get(Context, MDs);
7213     // Set operand 0 to refer to the loop id itself.
7214     NewLoopID->replaceOperandWith(0, NewLoopID);
7215     L->setLoopID(NewLoopID);
7216   }
7217 }
7218 
7219 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7220     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7221   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7222   bool PredicateAtRangeStart = Predicate(Range.Start);
7223 
7224   for (ElementCount TmpVF = Range.Start * 2;
7225        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7226     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7227       Range.End = TmpVF;
7228       break;
7229     }
7230 
7231   return PredicateAtRangeStart;
7232 }
7233 
7234 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7235 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7236 /// of VF's starting at a given VF and extending it as much as possible. Each
7237 /// vectorization decision can potentially shorten this sub-range during
7238 /// buildVPlan().
7239 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7240                                            ElementCount MaxVF) {
7241   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7242   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7243     VFRange SubRange = {VF, MaxVFPlusOne};
7244     VPlans.push_back(buildVPlan(SubRange));
7245     VF = SubRange.End;
7246   }
7247 }
7248 
7249 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7250                                          VPlanPtr &Plan) {
7251   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7252 
7253   // Look for cached value.
7254   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7255   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7256   if (ECEntryIt != EdgeMaskCache.end())
7257     return ECEntryIt->second;
7258 
7259   VPValue *SrcMask = createBlockInMask(Src, Plan);
7260 
7261   // The terminator has to be a branch inst!
7262   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7263   assert(BI && "Unexpected terminator found");
7264 
7265   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7266     return EdgeMaskCache[Edge] = SrcMask;
7267 
7268   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7269   assert(EdgeMask && "No Edge Mask found for condition");
7270 
7271   if (BI->getSuccessor(0) != Dst)
7272     EdgeMask = Builder.createNot(EdgeMask);
7273 
7274   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7275     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7276 
7277   return EdgeMaskCache[Edge] = EdgeMask;
7278 }
7279 
7280 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7281   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7282 
7283   // Look for cached value.
7284   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7285   if (BCEntryIt != BlockMaskCache.end())
7286     return BCEntryIt->second;
7287 
7288   // All-one mask is modelled as no-mask following the convention for masked
7289   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7290   VPValue *BlockMask = nullptr;
7291 
7292   if (OrigLoop->getHeader() == BB) {
7293     if (!CM.blockNeedsPredication(BB))
7294       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7295 
7296     // Create the block in mask as the first non-phi instruction in the block.
7297     VPBuilder::InsertPointGuard Guard(Builder);
7298     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7299     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7300 
7301     // Introduce the early-exit compare IV <= BTC to form header block mask.
7302     // This is used instead of IV < TC because TC may wrap, unlike BTC.
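    // For illustration only (IR names are made up), for a loop with
    // backedge-taken count %btc the header mask materializes roughly as
    //   %mask = icmp ule <VF x iN> %widened.iv, %splat.btc
    // unless the active-lane-mask path below is taken instead.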
7303     // Start by constructing the desired canonical IV.
7304     VPValue *IV = nullptr;
7305     if (Legal->getPrimaryInduction())
7306       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7307     else {
7308       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7309       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7310       IV = IVRecipe->getVPValue();
7311     }
7312     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7313     bool TailFolded = !CM.isScalarEpilogueAllowed();
7314 
7315     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP
      // instructions happens.
7320       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7321     } else {
7322       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7323     }
7324     return BlockMaskCache[BB] = BlockMask;
7325   }
7326 
7327   // This is the block mask. We OR all incoming edges.
7328   for (auto *Predecessor : predecessors(BB)) {
7329     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7330     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7331       return BlockMaskCache[BB] = EdgeMask;
7332 
7333     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7334       BlockMask = EdgeMask;
7335       continue;
7336     }
7337 
7338     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7339   }
7340 
7341   return BlockMaskCache[BB] = BlockMask;
7342 }
7343 
7344 VPWidenMemoryInstructionRecipe *
7345 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7346                                   VPlanPtr &Plan) {
7347   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7348          "Must be called with either a load or store");
7349 
7350   auto willWiden = [&](ElementCount VF) -> bool {
7351     assert(!VF.isScalable() && "unexpected scalable ElementCount");
7352     if (VF.isScalar())
7353       return false;
7354     LoopVectorizationCostModel::InstWidening Decision =
7355         CM.getWideningDecision(I, VF);
7356     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7357            "CM decision should be taken at this point.");
7358     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7359       return true;
7360     if (CM.isScalarAfterVectorization(I, VF) ||
7361         CM.isProfitableToScalarize(I, VF))
7362       return false;
7363     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7364   };
7365 
7366   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7367     return nullptr;
7368 
7369   VPValue *Mask = nullptr;
7370   if (Legal->isMaskRequired(I))
7371     Mask = createBlockInMask(I->getParent(), Plan);
7372 
7373   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7374   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7375     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7376 
7377   StoreInst *Store = cast<StoreInst>(I);
7378   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7379   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7380 }
7381 
7382 VPWidenIntOrFpInductionRecipe *
7383 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7384   // Check if this is an integer or fp induction. If so, build the recipe that
7385   // produces its scalar and vector values.
7386   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7387   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7388       II.getKind() == InductionDescriptor::IK_FpInduction)
7389     return new VPWidenIntOrFpInductionRecipe(Phi);
7390 
7391   return nullptr;
7392 }
7393 
7394 VPWidenIntOrFpInductionRecipe *
7395 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7396                                                 VFRange &Range) const {
7397   // Optimize the special case where the source is a constant integer
7398   // induction variable. Notice that we can only optimize the 'trunc' case
7399   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7400   // (c) other casts depend on pointer size.
7401 
7402   // Determine whether \p K is a truncation based on an induction variable that
7403   // can be optimized.
7404   auto isOptimizableIVTruncate =
7405       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7406     return [=](ElementCount VF) -> bool {
7407       return CM.isOptimizableIVTruncate(K, VF);
7408     };
7409   };
7410 
7411   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7412           isOptimizableIVTruncate(I), Range))
7413     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7414                                              I);
7415   return nullptr;
7416 }
7417 
7418 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7419   // We know that all PHIs in non-header blocks are converted into selects, so
7420   // we don't have to worry about the insertion order and we can just use the
7421   // builder. At this point we generate the predication tree. There may be
7422   // duplications since this is a simple recursive scan, but future
7423   // optimizations will clean it up.
7424 
7425   SmallVector<VPValue *, 2> Operands;
7426   unsigned NumIncoming = Phi->getNumIncomingValues();
7427   for (unsigned In = 0; In < NumIncoming; In++) {
7428     VPValue *EdgeMask =
7429       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7430     assert((EdgeMask || NumIncoming == 1) &&
7431            "Multiple predecessors with one having a full mask");
7432     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7433     if (EdgeMask)
7434       Operands.push_back(EdgeMask);
7435   }
7436   return new VPBlendRecipe(Phi, Operands);
7437 }
7438 
7439 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7440                                                    VPlan &Plan) const {
7441 
7442   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7443       [this, CI](ElementCount VF) {
7444         return CM.isScalarWithPredication(CI, VF);
7445       },
7446       Range);
7447 
7448   if (IsPredicated)
7449     return nullptr;
7450 
7451   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7452   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7453              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7454              ID == Intrinsic::pseudoprobe))
7455     return nullptr;
7456 
7457   auto willWiden = [&](ElementCount VF) -> bool {
7458     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // NeedToScalarize tells whether a plain vectorized call would have to be
    // scalarized for this VF; UseVectorIntrinsic tells whether calling a
    // vector intrinsic is at least as cheap as the vectorized library call.
7463     bool NeedToScalarize = false;
7464     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7465     bool UseVectorIntrinsic =
7466         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7467     return UseVectorIntrinsic || !NeedToScalarize;
7468   };
7469 
7470   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7471     return nullptr;
7472 
7473   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7474 }
7475 
7476 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7477   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7478          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7479   // Instruction should be widened, unless it is scalar after vectorization,
7480   // scalarization is profitable or it is predicated.
7481   auto WillScalarize = [this, I](ElementCount VF) -> bool {
7482     return CM.isScalarAfterVectorization(I, VF) ||
7483            CM.isProfitableToScalarize(I, VF) ||
7484            CM.isScalarWithPredication(I, VF);
7485   };
7486   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7487                                                              Range);
7488 }
7489 
7490 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7491   auto IsVectorizableOpcode = [](unsigned Opcode) {
7492     switch (Opcode) {
7493     case Instruction::Add:
7494     case Instruction::And:
7495     case Instruction::AShr:
7496     case Instruction::BitCast:
7497     case Instruction::FAdd:
7498     case Instruction::FCmp:
7499     case Instruction::FDiv:
7500     case Instruction::FMul:
7501     case Instruction::FNeg:
7502     case Instruction::FPExt:
7503     case Instruction::FPToSI:
7504     case Instruction::FPToUI:
7505     case Instruction::FPTrunc:
7506     case Instruction::FRem:
7507     case Instruction::FSub:
7508     case Instruction::ICmp:
7509     case Instruction::IntToPtr:
7510     case Instruction::LShr:
7511     case Instruction::Mul:
7512     case Instruction::Or:
7513     case Instruction::PtrToInt:
7514     case Instruction::SDiv:
7515     case Instruction::Select:
7516     case Instruction::SExt:
7517     case Instruction::Shl:
7518     case Instruction::SIToFP:
7519     case Instruction::SRem:
7520     case Instruction::Sub:
7521     case Instruction::Trunc:
7522     case Instruction::UDiv:
7523     case Instruction::UIToFP:
7524     case Instruction::URem:
7525     case Instruction::Xor:
7526     case Instruction::ZExt:
7527       return true;
7528     }
7529     return false;
7530   };
7531 
7532   if (!IsVectorizableOpcode(I->getOpcode()))
7533     return nullptr;
7534 
7535   // Success: widen this instruction.
7536   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7537 }
7538 
7539 VPBasicBlock *VPRecipeBuilder::handleReplication(
7540     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7541     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7542     VPlanPtr &Plan) {
7543   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7544       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7545       Range);
7546 
7547   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7548       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
7549       Range);
7550 
7551   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7552                                        IsUniform, IsPredicated);
7553   setRecipe(I, Recipe);
7554   Plan->addVPValue(I, Recipe);
7555 
7556   // Find if I uses a predicated instruction. If so, it will use its scalar
7557   // value. Avoid hoisting the insert-element which packs the scalar value into
7558   // a vector value, as that happens iff all users use the vector value.
7559   for (auto &Op : I->operands())
7560     if (auto *PredInst = dyn_cast<Instruction>(Op))
7561       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7562         PredInst2Recipe[PredInst]->setAlsoPack(false);
7563 
  // Finalize the recipe for Instr, handling the non-predicated case first.
7565   if (!IsPredicated) {
7566     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7567     VPBB->appendRecipe(Recipe);
7568     return VPBB;
7569   }
7570   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7571   assert(VPBB->getSuccessors().empty() &&
7572          "VPBB has successors when handling predicated replication.");
7573   // Record predicated instructions for above packing optimizations.
7574   PredInst2Recipe[I] = Recipe;
7575   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7576   VPBlockUtils::insertBlockAfter(Region, VPBB);
7577   auto *RegSucc = new VPBasicBlock();
7578   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7579   return RegSucc;
7580 }
7581 
7582 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7583                                                       VPRecipeBase *PredRecipe,
7584                                                       VPlanPtr &Plan) {
7585   // Instructions marked for predication are replicated and placed under an
7586   // if-then construct to prevent side-effects.
7587 
7588   // Generate recipes to compute the block mask for this region.
7589   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7590 
7591   // Build the triangular if-then region.
7592   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7593   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7594   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7595   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7596   auto *PHIRecipe = Instr->getType()->isVoidTy()
7597                         ? nullptr
7598                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
7599   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7600   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7601   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7602 
7603   // Note: first set Entry as region entry and then connect successors starting
7604   // from it in order, to propagate the "parent" of each VPBasicBlock.
7605   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7606   VPBlockUtils::connectBlocks(Pred, Exit);
7607 
7608   return Region;
7609 }
7610 
7611 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7612                                                       VFRange &Range,
7613                                                       VPlanPtr &Plan) {
7614   // First, check for specific widening recipes that deal with calls, memory
7615   // operations, inductions and Phi nodes.
7616   if (auto *CI = dyn_cast<CallInst>(Instr))
7617     return tryToWidenCall(CI, Range, *Plan);
7618 
7619   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7620     return tryToWidenMemory(Instr, Range, Plan);
7621 
7622   VPRecipeBase *Recipe;
7623   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7624     if (Phi->getParent() != OrigLoop->getHeader())
7625       return tryToBlend(Phi, Plan);
7626     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7627       return Recipe;
7628     return new VPWidenPHIRecipe(Phi);
7629   }
7630 
7631   if (isa<TruncInst>(Instr) &&
7632       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7633     return Recipe;
7634 
7635   if (!shouldWiden(Instr, Range))
7636     return nullptr;
7637 
7638   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7639     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7640                                 OrigLoop);
7641 
7642   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7643     bool InvariantCond =
7644         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7645     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7646                                    InvariantCond);
7647   }
7648 
7649   return tryToWiden(Instr, *Plan);
7650 }
7651 
7652 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
7653                                                         ElementCount MaxVF) {
7654   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7655 
7656   // Collect instructions from the original loop that will become trivially dead
7657   // in the vectorized loop. We don't need to vectorize these instructions. For
7658   // example, original induction update instructions can become dead because we
7659   // separately emit induction "steps" when generating code for the new loop.
7660   // Similarly, we create a new latch condition when setting up the structure
7661   // of the new loop, so the old one can become dead.
7662   SmallPtrSet<Instruction *, 4> DeadInstructions;
7663   collectTriviallyDeadInstructions(DeadInstructions);
7664 
7665   // Add assume instructions we need to drop to DeadInstructions, to prevent
7666   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7668   // control flow is preserved, we should keep them.
7669   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7670   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7671 
7672   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7673   // Dead instructions do not need sinking. Remove them from SinkAfter.
7674   for (Instruction *I : DeadInstructions)
7675     SinkAfter.erase(I);
7676 
7677   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7678   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7679     VFRange SubRange = {VF, MaxVFPlusOne};
7680     VPlans.push_back(
7681         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
7682     VF = SubRange.End;
7683   }
7684 }
7685 
7686 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7687     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
7688     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7689 
7690   // Hold a mapping from predicated instructions to their recipes, in order to
7691   // fix their AlsoPack behavior if a user is determined to replicate and use a
7692   // scalar instead of vector value.
7693   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7694 
7695   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7696 
7697   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7698 
7699   // ---------------------------------------------------------------------------
7700   // Pre-construction: record ingredients whose recipes we'll need to further
7701   // process after constructing the initial VPlan.
7702   // ---------------------------------------------------------------------------
7703 
7704   // Mark instructions we'll need to sink later and their targets as
7705   // ingredients whose recipe we'll need to record.
7706   for (auto &Entry : SinkAfter) {
7707     RecipeBuilder.recordRecipeOf(Entry.first);
7708     RecipeBuilder.recordRecipeOf(Entry.second);
7709   }
7710   for (auto &Reduction : CM.getInLoopReductionChains()) {
7711     PHINode *Phi = Reduction.first;
7712     RecurrenceDescriptor::RecurrenceKind Kind =
7713         Legal->getReductionVars()[Phi].getRecurrenceKind();
7714     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7715 
7716     RecipeBuilder.recordRecipeOf(Phi);
7717     for (auto &R : ReductionOperations) {
7718       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
7720       // need to record the ICmp recipe, so it can be removed later.
7721       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7722           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7723         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7724       }
7725     }
7726   }
7727 
7728   // For each interleave group which is relevant for this (possibly trimmed)
7729   // Range, add it to the set of groups to be later applied to the VPlan and add
7730   // placeholders for its members' Recipes which we'll be replacing with a
7731   // single VPInterleaveRecipe.
7732   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7733     auto applyIG = [IG, this](ElementCount VF) -> bool {
7734       return (VF.isVector() && // Query is illegal for VF == 1
7735               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7736                   LoopVectorizationCostModel::CM_Interleave);
7737     };
7738     if (!getDecisionAndClampRange(applyIG, Range))
7739       continue;
7740     InterleaveGroups.insert(IG);
7741     for (unsigned i = 0; i < IG->getFactor(); i++)
7742       if (Instruction *Member = IG->getMember(i))
7743         RecipeBuilder.recordRecipeOf(Member);
7744   };
7745 
7746   // ---------------------------------------------------------------------------
7747   // Build initial VPlan: Scan the body of the loop in a topological order to
7748   // visit each basic block after having visited its predecessor basic blocks.
7749   // ---------------------------------------------------------------------------
7750 
7751   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7752   auto Plan = std::make_unique<VPlan>();
7753   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7754   Plan->setEntry(VPBB);
7755 
7756   // Scan the body of the loop in a topological order to visit each basic block
7757   // after having visited its predecessor basic blocks.
7758   LoopBlocksDFS DFS(OrigLoop);
7759   DFS.perform(LI);
7760 
7761   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7762     // Relevant instructions from basic block BB will be grouped into VPRecipe
7763     // ingredients and fill a new VPBasicBlock.
7764     unsigned VPBBsForBB = 0;
7765     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7766     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7767     VPBB = FirstVPBBForBB;
7768     Builder.setInsertPoint(VPBB);
7769 
7770     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7772     for (Instruction &I : BB->instructionsWithoutDebug()) {
7773       Instruction *Instr = &I;
7774 
7775       // First filter out irrelevant instructions, to ensure no recipes are
7776       // built for them.
7777       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7778         continue;
7779 
7780       if (auto Recipe =
7781               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7782         // Check if the recipe can be converted to a VPValue. We need the extra
7783         // down-casting step until VPRecipeBase inherits from VPValue.
7784         VPValue *MaybeVPValue = Recipe->toVPValue();
7785         if (!Instr->getType()->isVoidTy() && MaybeVPValue)
7786           Plan->addVPValue(Instr, MaybeVPValue);
7787 
7788         RecipeBuilder.setRecipe(Instr, Recipe);
7789         VPBB->appendRecipe(Recipe);
7790         continue;
7791       }
7792 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7795       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7796           Instr, Range, VPBB, PredInst2Recipe, Plan);
7797       if (NextVPBB != VPBB) {
7798         VPBB = NextVPBB;
7799         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7800                                     : "");
7801       }
7802     }
7803   }
7804 
  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic blocks with no recipes.
7808   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7809   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7810   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7811   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7812   delete PreEntry;
7813 
7814   // ---------------------------------------------------------------------------
7815   // Transform initial VPlan: Apply previously taken decisions, in order, to
7816   // bring the VPlan to its final state.
7817   // ---------------------------------------------------------------------------
7818 
7819   // Apply Sink-After legal constraints.
7820   for (auto &Entry : SinkAfter) {
7821     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7822     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7823     Sink->moveAfter(Target);
7824   }
7825 
7826   // Interleave memory: for each Interleave Group we marked earlier as relevant
7827   // for this VPlan, replace the Recipes widening its memory instructions with a
7828   // single VPInterleaveRecipe at its insertion point.
7829   for (auto IG : InterleaveGroups) {
7830     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7831         RecipeBuilder.getRecipe(IG->getInsertPos()));
7832     SmallVector<VPValue *, 4> StoredValues;
7833     for (unsigned i = 0; i < IG->getFactor(); ++i)
7834       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
7835         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
7836 
7837     (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
7838                             Recipe->getMask()))
7839         ->insertBefore(Recipe);
7840 
7841     for (unsigned i = 0; i < IG->getFactor(); ++i)
7842       if (Instruction *Member = IG->getMember(i)) {
7843         if (!Member->getType()->isVoidTy()) {
7844           VPValue *OriginalV = Plan->getVPValue(Member);
7845           Plan->removeVPValueFor(Member);
7846           OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member));
7847         }
7848         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7849       }
7850   }
7851 
7852   // Adjust the recipes for any inloop reductions.
7853   if (Range.Start.isVector())
7854     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
7855 
7856   // Finally, if tail is folded by masking, introduce selects between the phi
7857   // and the live-out instruction of each reduction, at the end of the latch.
7858   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7859     Builder.setInsertPoint(VPBB);
7860     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7861     for (auto &Reduction : Legal->getReductionVars()) {
7862       if (CM.isInLoopReduction(Reduction.first))
7863         continue;
7864       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
7865       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
7866       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7867     }
7868   }
7869 
7870   std::string PlanName;
7871   raw_string_ostream RSO(PlanName);
7872   ElementCount VF = Range.Start;
7873   Plan->addVF(VF);
7874   RSO << "Initial VPlan for VF={" << VF;
7875   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
7876     Plan->addVF(VF);
7877     RSO << "," << VF;
7878   }
7879   RSO << "},UF>=1";
7880   RSO.flush();
7881   Plan->setName(PlanName);
7882 
7883   return Plan;
7884 }
7885 
7886 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7887   // Outer loop handling: They may require CFG and instruction level
7888   // transformations before even evaluating whether vectorization is profitable.
7889   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7890   // the vectorization pipeline.
7891   assert(!OrigLoop->isInnermost());
7892   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7893 
7894   // Create new empty VPlan
7895   auto Plan = std::make_unique<VPlan>();
7896 
7897   // Build hierarchical CFG
7898   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7899   HCFGBuilder.buildHierarchicalCFG();
7900 
7901   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
7902        VF *= 2)
7903     Plan->addVF(VF);
7904 
7905   if (EnableVPlanPredication) {
7906     VPlanPredicator VPP(*Plan);
7907     VPP.predicate();
7908 
7909     // Avoid running transformation to recipes until masked code generation in
7910     // VPlan-native path is in place.
7911     return Plan;
7912   }
7913 
7914   SmallPtrSet<Instruction *, 1> DeadInstructions;
7915   VPlanTransforms::VPInstructionsToVPRecipes(
7916       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7917   return Plan;
7918 }
7919 
// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
7924 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7925     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7926   for (auto &Reduction : CM.getInLoopReductionChains()) {
7927     PHINode *Phi = Reduction.first;
7928     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7929     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7930 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
7934     // For minmax the chain will be the select instructions.
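    // For example (illustrative names), for an in-loop integer add reduction
    // whose chain is
    //   %phi = phi ...  ;  %sum = add %phi, %vec.val
    // the widened add is replaced below by a VPReductionRecipe whose ChainOp
    // is the VPValue of %phi and whose VecOp is the VPValue of %vec.val.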
7935     Instruction *Chain = Phi;
7936     for (Instruction *R : ReductionOperations) {
7937       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
7938       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
7939 
7940       VPValue *ChainOp = Plan->getVPValue(Chain);
7941       unsigned FirstOpId;
7942       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7943           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7944         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
7945                "Expected to replace a VPWidenSelectSC");
7946         FirstOpId = 1;
7947       } else {
7948         assert(isa<VPWidenRecipe>(WidenRecipe) &&
7949                "Expected to replace a VPWidenSC");
7950         FirstOpId = 0;
7951       }
7952       unsigned VecOpId =
7953           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
7954       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
7955 
7956       auto *CondOp = CM.foldTailByMasking()
7957                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
7958                          : nullptr;
7959       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
7960           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
7961       WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe);
7962       Plan->removeVPValueFor(R);
7963       Plan->addVPValue(R, RedRecipe);
7964       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
7965       WidenRecipe->eraseFromParent();
7966 
7967       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7968           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7969         VPRecipeBase *CompareRecipe =
7970             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
7971         assert(isa<VPWidenRecipe>(CompareRecipe) &&
7972                "Expected to replace a VPWidenSC");
7973         assert(CompareRecipe->toVPValue()->getNumUsers() == 0 &&
7974                "Expected no remaining users");
7975         CompareRecipe->eraseFromParent();
7976       }
7977       Chain = R;
7978     }
7979   }
7980 }
7981 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7986 
7987 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7988     Value *V, const VPIteration &Instance) {
7989   return ILV.getOrCreateScalarValue(V, Instance);
7990 }
7991 
7992 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7993                                VPSlotTracker &SlotTracker) const {
7994   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7995   IG->getInsertPos()->printAsOperand(O, false);
7996   O << ", ";
7997   getAddr()->printAsOperand(O, SlotTracker);
7998   VPValue *Mask = getMask();
7999   if (Mask) {
8000     O << ", ";
8001     Mask->printAsOperand(O, SlotTracker);
8002   }
8003   for (unsigned i = 0; i < IG->getFactor(); ++i)
8004     if (Instruction *I = IG->getMember(i))
8005       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8006 }
8007 
8008 void VPWidenCallRecipe::execute(VPTransformState &State) {
8009   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8010                                   *this, State);
8011 }
8012 
8013 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8014   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8015                                     this, *this, InvariantCond, State);
8016 }
8017 
8018 void VPWidenRecipe::execute(VPTransformState &State) {
8019   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8020 }
8021 
8022 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8023   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8024                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8025                       IsIndexLoopInvariant, State);
8026 }
8027 
8028 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8029   assert(!State.Instance && "Int or FP induction being replicated.");
8030   State.ILV->widenIntOrFpInduction(IV, Trunc);
8031 }
8032 
8033 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8034   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8035 }
8036 
8037 void VPBlendRecipe::execute(VPTransformState &State) {
8038   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8039   // We know that all PHIs in non-header blocks are converted into
8040   // selects, so we don't have to worry about the insertion order and we
8041   // can just use the builder.
8042   // At this point we generate the predication tree. There may be
8043   // duplications since this is a simple recursive scan, but future
8044   // optimizations will clean it up.
8045 
8046   unsigned NumIncoming = getNumIncomingValues();
8047 
8048   // Generate a sequence of selects of the form:
8049   // SELECT(Mask3, In3,
8050   //        SELECT(Mask2, In2,
8051   //               SELECT(Mask1, In1,
8052   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
8055   InnerLoopVectorizer::VectorParts Entry(State.UF);
8056   for (unsigned In = 0; In < NumIncoming; ++In) {
8057     for (unsigned Part = 0; Part < State.UF; ++Part) {
8058       // We might have single edge PHIs (blocks) - use an identity
8059       // 'select' for the first PHI operand.
8060       Value *In0 = State.get(getIncomingValue(In), Part);
8061       if (In == 0)
8062         Entry[Part] = In0; // Initialize with the first incoming value.
8063       else {
8064         // Select between the current value and the previous incoming edge
8065         // based on the incoming mask.
8066         Value *Cond = State.get(getMask(In), Part);
8067         Entry[Part] =
8068             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8069       }
8070     }
8071   }
8072   for (unsigned Part = 0; Part < State.UF; ++Part)
8073     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8074 }
8075 
8076 void VPInterleaveRecipe::execute(VPTransformState &State) {
8077   assert(!State.Instance && "Interleave group being replicated.");
8078   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(),
8079                                       getMask());
8080 }
8081 
8082 void VPReductionRecipe::execute(VPTransformState &State) {
8083   assert(!State.Instance && "Reduction being replicated.");
8084   for (unsigned Part = 0; Part < State.UF; ++Part) {
8085     RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8086     Value *NewVecOp = State.get(getVecOp(), Part);
8087     if (VPValue *Cond = getCondOp()) {
8088       Value *NewCond = State.get(Cond, Part);
8089       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8090       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8091           Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8092       Constant *IdenVec =
8093           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8094       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8095       NewVecOp = Select;
8096     }
8097     Value *NewRed =
8098         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8099     Value *PrevInChain = State.get(getChainOp(), Part);
8100     Value *NextInChain;
8101     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8102         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8103       NextInChain =
8104           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8105                          NewRed, PrevInChain);
8106     } else {
8107       NextInChain = State.Builder.CreateBinOp(
8108           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8109           PrevInChain);
8110     }
8111     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8112   }
8113 }
8114 
8115 void VPReplicateRecipe::execute(VPTransformState &State) {
8116   if (State.Instance) { // Generate a single instance.
8117     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8118                                     *State.Instance, IsPredicated, State);
8119     // Insert scalar instance packing it into a vector.
8120     if (AlsoPack && State.VF.isVector()) {
8121       // If we're constructing lane 0, initialize to start from undef.
8122       if (State.Instance->Lane == 0) {
8123         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8124         Value *Undef = UndefValue::get(
8125             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8126         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8127                                       State.Instance->Part, Undef);
8128       }
8129       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8130                                            *State.Instance);
8131     }
8132     return;
8133   }
8134 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
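  // For instance, with UF = 2 and a fixed VF of 4 this emits 8 scalar copies,
  // or just 2 (one per part) when the instruction is uniform.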
8138   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8139   for (unsigned Part = 0; Part < State.UF; ++Part)
8140     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8141       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8142                                       IsPredicated, State);
8143 }
8144 
8145 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8146   assert(State.Instance && "Branch on Mask works only on single instance.");
8147 
8148   unsigned Part = State.Instance->Part;
8149   unsigned Lane = State.Instance->Lane;
8150 
8151   Value *ConditionBit = nullptr;
8152   VPValue *BlockInMask = getMask();
8153   if (BlockInMask) {
8154     ConditionBit = State.get(BlockInMask, Part);
8155     if (ConditionBit->getType()->isVectorTy())
8156       ConditionBit = State.Builder.CreateExtractElement(
8157           ConditionBit, State.Builder.getInt32(Lane));
8158   } else // Block in mask is all-one.
8159     ConditionBit = State.Builder.getTrue();
8160 
8161   // Replace the temporary unreachable terminator with a new conditional branch,
8162   // whose two destinations will be set later when they are created.
8163   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8164   assert(isa<UnreachableInst>(CurrentTerminator) &&
8165          "Expected to replace unreachable terminator with conditional branch.");
8166   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8167   CondBr->setSuccessor(0, nullptr);
8168   ReplaceInstWithInst(CurrentTerminator, CondBr);
8169 }
8170 
8171 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8172   assert(State.Instance && "Predicated instruction PHI works per instance.");
8173   Instruction *ScalarPredInst =
8174       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8175   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8176   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8177   assert(PredicatingBB && "Predicated block has no single predecessor.");
8178 
8179   // By current pack/unpack logic we need to generate only a single phi node: if
8180   // a vector value for the predicated instruction exists at this point it means
8181   // the instruction has vector users only, and a phi for the vector value is
8182   // needed. In this case the recipe of the predicated instruction is marked to
8183   // also do that packing, thereby "hoisting" the insert-element sequence.
8184   // Otherwise, a phi node for the scalar value is needed.
8185   unsigned Part = State.Instance->Part;
8186   Instruction *PredInst =
8187       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8188   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8189     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8190     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8191     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8192     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8193     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8194     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8195   } else {
8196     Type *PredInstType = PredInst->getType();
8197     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8198     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8199     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8200     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8201   }
8202 }
8203 
8204 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8205   Instruction *Instr = getUnderlyingInstr();
8206   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
8207   State.ILV->vectorizeMemoryInstruction(Instr, State,
8208                                         StoredValue ? nullptr : this, getAddr(),
8209                                         StoredValue, getMask());
8210 }
8211 
8212 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8213 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8214 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8215 // for predication.
8216 static ScalarEpilogueLowering getScalarEpilogueLowering(
8217     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8218     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8219     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8220     LoopVectorizationLegality &LVL) {
8221   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8222   // don't look at hints or options, and don't request a scalar epilogue.
8223   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8224   // LoopAccessInfo (due to code dependency and not being able to reliably get
8225   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8226   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8227   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8228   // back to the old way and vectorize with versioning when forced. See D81345.)
8229   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8230                                                       PGSOQueryType::IRPass) &&
8231                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8232     return CM_ScalarEpilogueNotAllowedOptSize;
8233 
8234   bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8235                               !PreferPredicateOverEpilogue;
8236 
8237   // 2) Next, if disabling predication is requested on the command line, honour
8238   // this and request a scalar epilogue.
8239   if (PredicateOptDisabled)
8240     return CM_ScalarEpilogueAllowed;
8241 
8242   // 3) and 4) look if enabling predication is requested on the command line,
8243   // with a loop hint, or if the TTI hook indicates this is profitable, request
8244   // predication.
8245   if (PreferPredicateOverEpilogue ||
8246       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8247       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8248                                         LVL.getLAI()) &&
8249        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8250     return CM_ScalarEpilogueNotNeededUsePredicate;
8251 
8252   return CM_ScalarEpilogueAllowed;
8253 }
8254 
8255 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8256                            unsigned Part) {
8257   set(Def, V, Part);
8258   ILV->setVectorValue(IRDef, Part, V);
8259 }
8260 
8261 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
8263 // VPlan-to-VPlan transformations from the very beginning without modifying the
8264 // input LLVM IR.
8265 static bool processLoopInVPlanNativePath(
8266     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8267     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8268     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8269     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8270     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8271 
8272   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8273     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8274     return false;
8275   }
8276   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8277   Function *F = L->getHeader()->getParent();
8278   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8279 
8280   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8281       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8282 
8283   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8284                                 &Hints, IAI);
8285   // Use the planner for outer loop vectorization.
8286   // TODO: CM is not used at this point inside the planner. Turn CM into an
8287   // optional argument if we don't need it in the future.
8288   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8289 
8290   // Get user vectorization factor.
8291   const unsigned UserVF = Hints.getWidth();
8292 
8293   // Plan how to best vectorize, return the best VF and its cost.
8294   const VectorizationFactor VF =
8295       LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
8296 
8297   // If we are stress testing VPlan builds, do not attempt to generate vector
8298   // code. Masked vector code generation support will follow soon.
8299   // Also, do not attempt to vectorize if no vector code will be produced.
8300   if (VPlanBuildStressTest || EnableVPlanPredication ||
8301       VectorizationFactor::Disabled() == VF)
8302     return false;
8303 
8304   LVP.setBestPlan(VF.Width, 1);
8305 
8306   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8307                          &CM, BFI, PSI);
8308   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8309                     << L->getHeader()->getParent()->getName() << "\"\n");
8310   LVP.executePlan(LB, DT);
8311 
8312   // Mark the loop as already vectorized to avoid vectorizing again.
8313   Hints.setAlreadyVectorized();
8314 
8315   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8316   return true;
8317 }
8318 
8319 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8320     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8321                                !EnableLoopInterleaving),
8322       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8323                               !EnableLoopVectorization) {}
8324 
8325 bool LoopVectorizePass::processLoop(Loop *L) {
8326   assert((EnableVPlanNativePath || L->isInnermost()) &&
8327          "VPlan-native path is not enabled. Only process inner loops.");
8328 
8329 #ifndef NDEBUG
8330   const std::string DebugLocStr = getDebugLocString(L);
8331 #endif /* NDEBUG */
8332 
8333   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8334                     << L->getHeader()->getParent()->getName() << "\" from "
8335                     << DebugLocStr << "\n");
8336 
8337   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8338 
8339   LLVM_DEBUG(
8340       dbgs() << "LV: Loop hints:"
8341              << " force="
8342              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8343                      ? "disabled"
8344                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8345                             ? "enabled"
8346                             : "?"))
8347              << " width=" << Hints.getWidth()
8348              << " unroll=" << Hints.getInterleave() << "\n");
8349 
8350   // Function containing loop
8351   Function *F = L->getHeader()->getParent();
8352 
8353   // Looking at the diagnostic output is the only way to determine if a loop
8354   // was vectorized (other than looking at the IR or machine code), so it
8355   // is important to generate an optimization remark for each loop. Most of
8356   // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
8360 
8361   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8362     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8363     return false;
8364   }
8365 
8366   PredicatedScalarEvolution PSE(*SE, *L);
8367 
8368   // Check if it is legal to vectorize the loop.
8369   LoopVectorizationRequirements Requirements(*ORE);
8370   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8371                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8372   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8373     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8374     Hints.emitRemarkWithHints();
8375     return false;
8376   }
8377 
8378   // Check the function attributes and profiles to find out if this function
8379   // should be optimized for size.
8380   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8381       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8382 
8383   // Entrance to the VPlan-native vectorization path. Outer loops are processed
8384   // here. They may require CFG and instruction level transformations before
8385   // even evaluating whether vectorization is profitable. Since we cannot modify
8386   // the incoming IR, we need to build VPlan upfront in the vectorization
8387   // pipeline.
8388   if (!L->isInnermost())
8389     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
8390                                         ORE, BFI, PSI, Hints);
8391 
8392   assert(L->isInnermost() && "Inner loop expected.");
8393 
8394   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8395   // count by optimizing for size, to minimize overheads.
8396   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
8397   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
8398     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8399                       << "This loop is worth vectorizing only if no scalar "
8400                       << "iteration overheads are incurred.");
8401     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8402       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8403     else {
8404       LLVM_DEBUG(dbgs() << "\n");
8405       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
8406     }
8407   }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem like it can be correct -- what if the loop
  // is an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }
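  // Interleave groups cover accesses such as A[2*i] and A[2*i+1] within one
  // iteration; instead of two strided accesses, such a group can, for example,
  // be loaded as one wide load followed by shuffles that separate the even and
  // odd elements.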

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();
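  // Values to ignore are instructions whose cost should not influence the
  // decision because they are not expected to appear in the vector loop, e.g.
  // ephemeral values that only feed llvm.assume, or casts made redundant by
  // computing in a narrower type.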

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();
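  // Both values typically come from loop metadata set by a front end, e.g.
  // "#pragma clang loop vectorize_width(4) interleave_count(2)"; a width of 0
  // means the user did not request a specific vectorization factor.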

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);
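  // plan() builds VPlans for the candidate VF range and returns the factor the
  // cost model considers best; it returns None when both vectorization and
  // interleaving should be avoided up front, which is diagnosed below.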

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;
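  // VectorizationFactor::Disabled() is a scalar (width-1) factor, so if plan()
  // produced no result, the VF.Width.isScalar() check below still treats the
  // loop as not worth widening.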

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // Vectorizing was judged not beneficial, but interleaving still is (or was
    // explicitly requested), so interleave the scalar loop.
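    // Note: InnerLoopUnroller is an InnerLoopVectorizer with the vector width
    // pinned to 1, so executing the plan emits an interleaved (unrolled)
    // scalar loop rather than vector code.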
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // We decided it is legal and worthwhile to vectorize the loop, so do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
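  // These constants name the "llvm.loop.vectorize.followup_all" and
  // "llvm.loop.vectorize.followup_epilogue" attributes, through which a front
  // end can request metadata to be attached to the remainder loop left behind
  // after vectorization.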
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);
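    // (This attaches "llvm.loop.unroll.runtime.disable" to the remaining
    // scalar loop, which is expected to execute rarely.)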

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
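  // Simplified (loop-simplify) form guarantees a preheader, a single backedge
  // and dedicated exit blocks, all of which the legality checks and the
  // transform itself rely on.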

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);
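    // LCSSA form funnels every value that is live out of the loop through a
    // PHI node in an exit block, which keeps the rewriting of live-outs local
    // when the loop body is replaced.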

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
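  // The callback gives the function-level vectorizer lazy, per-loop access to
  // LoopAccessAnalysis through the inner loop analysis manager; the nullptr
  // fills the optional BlockFrequencyInfo slot of LoopStandardAnalysisResults,
  // which this query leaves unset.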
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}