1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
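//
// For example (illustrative only, not tied to any particular target), with a
// vectorization factor of 4 the loop
//
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// is rewritten so that each iteration of the new loop computes A[i..i+3] from
// B[i..i+3] using <4 x i32> operations, and 'i' is incremented by 4.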
17 //
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
27 //
// There is an ongoing effort to migrate the loop vectorizer to the VPlan
// infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and predication is preferred; the enum below lists the available
// choices. That is, the vectorizer will try to fold the tail loop (epilogue)
// into the vector body and predicate the instructions accordingly. If
// tail-folding fails, the fallback strategy depends on the selected value.
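//
// For illustration only (a sketch, not target-specific code), tail-folding
// executes every iteration in the vector body under an active-lane mask:
//
//   for (i = 0; i < n; i += VF) {
//     M = (i + <0,1,...,VF-1>) < n;   // mask of lanes still inside the loop
//     V = masked.load(&A[i], M);
//     masked.store(V + 1, &A[i], M);
//   }
//
// so no scalar epilogue (remainder) loop is needed.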
206 namespace PreferPredicateTy {
207   enum Option {
208     ScalarEpilogue = 0,
209     PredicateElseScalarEpilogue,
210     PredicateOrDontVectorize
211   };
212 } // namespace PreferPredicateTy
213 
214 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
215     "prefer-predicate-over-epilogue",
216     cl::init(PreferPredicateTy::ScalarEpilogue),
217     cl::Hidden,
218     cl::desc("Tail-folding and predication preferences over creating a scalar "
219              "epilogue loop."),
220     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
221                          "scalar-epilogue",
222                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "Prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "Prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
231 
232 static cl::opt<bool> MaximizeBandwidth(
233     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting the vectorization factor, "
             "which will be determined by the smallest type in the loop."));
236 
237 static cl::opt<bool> EnableInterleavedMemAccesses(
238     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
239     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
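
// For example (illustrative only), the two strided accesses below form one
// interleave group with factor 2, which can be vectorized as a single wide
// load followed by shuffles that separate the even and odd elements:
//
//   for (i = 0; i < n; ++i) {
//     Sum += A[2 * i];     // even elements
//     Sum += A[2 * i + 1]; // odd elements
//   }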
240 
241 /// An interleave-group may need masking if it resides in a block that needs
242 /// predication, or in order to mask away gaps.
243 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
244     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
245     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
246 
247 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
248     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));
251 
252 static cl::opt<unsigned> ForceTargetNumScalarRegs(
253     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
254     cl::desc("A flag that overrides the target's number of scalar registers."));
255 
256 static cl::opt<unsigned> ForceTargetNumVectorRegs(
257     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
258     cl::desc("A flag that overrides the target's number of vector registers."));
259 
260 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
261     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
262     cl::desc("A flag that overrides the target's max interleave factor for "
263              "scalar loops."));
264 
265 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
266     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
267     cl::desc("A flag that overrides the target's max interleave factor for "
268              "vectorized loops."));
269 
270 static cl::opt<unsigned> ForceTargetInstructionCost(
271     "force-target-instruction-cost", cl::init(0), cl::Hidden,
272     cl::desc("A flag that overrides the target's expected cost for "
273              "an instruction to a single constant value. Mostly "
274              "useful for getting consistent testing."));
275 
276 static cl::opt<bool> ForceTargetSupportsScalableVectors(
277     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
278     cl::desc(
279         "Pretend that scalable vectors are supported, even if the target does "
280         "not support them. This flag should only be used for testing."));
281 
282 static cl::opt<unsigned> SmallLoopCost(
283     "small-loop-cost", cl::init(20), cl::Hidden,
284     cl::desc(
285         "The cost of a loop that is considered 'small' by the interleaver."));
286 
287 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
288     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
289     cl::desc("Enable the use of the block frequency analysis to access PGO "
290              "heuristics minimizing code growth in cold regions and being more "
291              "aggressive in hot regions."));
292 
293 // Runtime interleave loops for load/store throughput.
294 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
295     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
296     cl::desc(
297         "Enable runtime interleaving until load/store ports are saturated"));
298 
299 /// Interleave small loops with scalar reductions.
300 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
301     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
302     cl::desc("Enable interleaving for loops with small iteration counts that "
303              "contain scalar reductions to expose ILP."));
304 
305 /// The number of stores in a loop that are allowed to need predication.
306 static cl::opt<unsigned> NumberOfStoresToPredicate(
307     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
308     cl::desc("Max number of stores to be predicated behind an if."));
309 
310 static cl::opt<bool> EnableIndVarRegisterHeur(
311     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
312     cl::desc("Count the induction variable only once when interleaving"));
313 
314 static cl::opt<bool> EnableCondStoresVectorization(
315     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));
317 
318 static cl::opt<unsigned> MaxNestedScalarReductionIC(
319     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
320     cl::desc("The maximum interleave count to use when interleaving a scalar "
321              "reduction in a nested loop."));
322 
323 static cl::opt<bool>
324     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
325                            cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
328 
329 static cl::opt<bool> PreferPredicatedReductionSelect(
330     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
331     cl::desc(
332         "Prefer predicating a reduction operation over an after loop select."));
333 
334 cl::opt<bool> EnableVPlanNativePath(
335     "enable-vplan-native-path", cl::init(false), cl::Hidden,
336     cl::desc("Enable VPlan-native vectorization path with "
337              "support for outer loop vectorization."));
338 
339 // FIXME: Remove this switch once we have divergence analysis. Currently we
340 // assume divergent non-backedge branches when this switch is true.
341 cl::opt<bool> EnableVPlanPredication(
342     "enable-vplan-predication", cl::init(false), cl::Hidden,
343     cl::desc("Enable VPlan-native vectorization path predicator with "
344              "support for outer loop vectorization."));
345 
346 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
348 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
349 // verification of the H-CFGs built.
350 static cl::opt<bool> VPlanBuildStressTest(
351     "vplan-build-stress-test", cl::init(false), cl::Hidden,
352     cl::desc(
353         "Build VPlan for every supported loop nest in the function and bail "
354         "out right after the build (stress test the VPlan H-CFG construction "
355         "in the VPlan-native vectorization path)."));
356 
357 cl::opt<bool> llvm::EnableLoopInterleaving(
358     "interleave-loops", cl::init(true), cl::Hidden,
359     cl::desc("Enable loop interleaving in Loop vectorization passes"));
360 cl::opt<bool> llvm::EnableLoopVectorization(
361     "vectorize-loops", cl::init(true), cl::Hidden,
362     cl::desc("Run the Loop vectorization passes"));
363 
/// A helper function that returns the type of a loaded or stored value.
365 static Type *getMemInstValueType(Value *I) {
366   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
367          "Expected Load or Store instruction");
368   if (auto *LI = dyn_cast<LoadInst>(I))
369     return LI->getType();
370   return cast<StoreInst>(I)->getValueOperand()->getType();
371 }
372 
373 /// A helper function that returns true if the given type is irregular. The
374 /// type is irregular if its allocated size doesn't equal the store size of an
375 /// element of the corresponding vector type at the given vectorization factor.
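///
/// For example (assuming a typical data layout), i1 is irregular at VF = 4:
/// four separately allocated i1 values occupy 4 bytes, while a <4 x i1>
/// vector has a store size of only 1 byte.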
376 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
377   // Determine if an array of VF elements of type Ty is "bitcast compatible"
378   // with a <VF x Ty> vector.
379   if (VF.isVector()) {
380     auto *VectorTy = VectorType::get(Ty, VF);
381     return TypeSize::get(VF.getKnownMinValue() *
382                              DL.getTypeAllocSize(Ty).getFixedValue(),
383                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
384   }
385 
386   // If the vectorization factor is one, we just check if an array of type Ty
387   // requires padding between elements.
388   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
389 }
390 
391 /// A helper function that returns the reciprocal of the block probability of
392 /// predicated blocks. If we return X, we are assuming the predicated block
393 /// will execute once for every X iterations of the loop header.
394 ///
395 /// TODO: We should use actual block probability here, if available. Currently,
396 ///       we always assume predicated blocks have a 50% chance of executing.
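///
/// For example, a return value of 2 means a predicated block is assumed to
/// execute on every other iteration of the header, so its per-iteration cost
/// is divided by 2 when estimating profitability.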
397 static unsigned getReciprocalPredBlockProb() { return 2; }
398 
399 /// A helper function that adds a 'fast' flag to floating-point operations.
400 static Value *addFastMathFlag(Value *V) {
401   if (isa<FPMathOperator>(V))
402     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
403   return V;
404 }
405 
406 /// A helper function that returns an integer or floating-point constant with
407 /// value C.
408 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
409   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
410                            : ConstantFP::get(Ty, C);
411 }
412 
413 /// Returns "best known" trip count for the specified loop \p L as defined by
414 /// the following procedure:
415 ///   1) Returns exact trip count if it is known.
416 ///   2) Returns expected trip count according to profile data if any.
417 ///   3) Returns upper bound estimate if it is known.
418 ///   4) Returns None if all of the above failed.
419 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
420   // Check if exact trip count is known.
421   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
422     return ExpectedTC;
423 
424   // Check if there is an expected trip count available from profile data.
425   if (LoopVectorizeWithBlockFrequency)
426     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
427       return EstimatedTC;
428 
429   // Check if upper bound estimate is known.
430   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
431     return ExpectedTC;
432 
433   return None;
434 }
435 
436 namespace llvm {
437 
438 /// InnerLoopVectorizer vectorizes loops which contain only one basic
439 /// block to a specified vectorization factor (VF).
440 /// This class performs the widening of scalars into vectors, or multiple
441 /// scalars. This class also implements the following features:
442 /// * It inserts an epilogue loop for handling loops that don't have iteration
443 ///   counts that are known to be a multiple of the vectorization factor.
444 /// * It handles the code generation for reduction variables.
445 /// * Scalarization (implementation using scalars) of un-vectorizable
446 ///   instructions.
447 /// InnerLoopVectorizer does not perform any vectorization-legality
448 /// checks, and relies on the caller to check for the different legality
449 /// aspects. The InnerLoopVectorizer relies on the
450 /// LoopVectorizationLegality class to provide information about the induction
451 /// and reduction variables that were found to a given vectorization factor.
452 class InnerLoopVectorizer {
453 public:
454   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
455                       LoopInfo *LI, DominatorTree *DT,
456                       const TargetLibraryInfo *TLI,
457                       const TargetTransformInfo *TTI, AssumptionCache *AC,
458                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
459                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
460                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
461                       ProfileSummaryInfo *PSI)
462       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
463         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
464         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
465         PSI(PSI) {
466     // Query this against the original loop and save it here because the profile
467     // of the original loop header may change as the transformation happens.
468     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
469         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
470   }
471 
472   virtual ~InnerLoopVectorizer() = default;
473 
474   /// Create a new empty loop that will contain vectorized instructions later
475   /// on, while the old loop will be used as the scalar remainder. Control flow
476   /// is generated around the vectorized (and scalar epilogue) loops consisting
477   /// of various checks and bypasses. Return the pre-header block of the new
478   /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
480   /// handle the more complex control flow around the loops.
481   virtual BasicBlock *createVectorizedLoopSkeleton();
482 
483   /// Widen a single instruction within the innermost loop.
484   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
485                         VPTransformState &State);
486 
487   /// Widen a single call instruction within the innermost loop.
488   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
489                             VPTransformState &State);
490 
491   /// Widen a single select instruction within the innermost loop.
492   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
493                               bool InvariantCond, VPTransformState &State);
494 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
496   void fixVectorizedLoop(VPTransformState &State);
497 
498   // Return true if any runtime check is added.
499   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
500 
501   /// A type for vectorized values in the new loop. Each value from the
502   /// original loop, when vectorized, is represented by UF vector values in the
503   /// new unrolled loop, where UF is the unroll factor.
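  /// For example (illustrative), with UF = 2 and VF = 4 a single i32 value
  /// from the original loop is represented by two <4 x i32> values.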
504   using VectorParts = SmallVector<Value *, 2>;
505 
506   /// Vectorize a single GetElementPtrInst based on information gathered and
507   /// decisions taken during planning.
508   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
509                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
510                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
511 
512   /// Vectorize a single PHINode in a block. This method handles the induction
513   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
514   /// arbitrary length vectors.
515   void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
516                            VPValue *StartV, VPValue *Def,
517                            VPTransformState &State);
518 
519   /// A helper function to scalarize a single Instruction in the innermost loop.
520   /// Generates a sequence of scalar instances for each lane between \p MinLane
521   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
522   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
523   /// Instr's operands.
524   void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
525                             const VPIteration &Instance, bool IfPredicateInstr,
526                             VPTransformState &State);
527 
528   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
529   /// is provided, the integer induction variable will first be truncated to
530   /// the corresponding type.
531   void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
532                              VPValue *Def, VPValue *CastDef,
533                              VPTransformState &State);
534 
535   /// Construct the vector value of a scalarized value \p V one lane at a time.
536   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
537                                  VPTransformState &State);
538 
539   /// Try to vectorize interleaved access group \p Group with the base address
540   /// given in \p Addr, optionally masking the vector operations if \p
541   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
542   /// values in the vectorized loop.
543   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
544                                 ArrayRef<VPValue *> VPDefs,
545                                 VPTransformState &State, VPValue *Addr,
546                                 ArrayRef<VPValue *> StoredValues,
547                                 VPValue *BlockInMask = nullptr);
548 
549   /// Vectorize Load and Store instructions with the base address given in \p
550   /// Addr, optionally masking the vector operations if \p BlockInMask is
551   /// non-null. Use \p State to translate given VPValues to IR values in the
552   /// vectorized loop.
553   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
554                                   VPValue *Def, VPValue *Addr,
555                                   VPValue *StoredValue, VPValue *BlockInMask);
556 
557   /// Set the debug location in the builder using the debug location in
558   /// the instruction.
559   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
560 
561   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
562   void fixNonInductionPHIs(VPTransformState &State);
563 
564   /// Create a broadcast instruction. This method generates a broadcast
565   /// instruction (shuffle) for loop invariant values and for the induction
566   /// value. If this is the induction variable then we extend it to N, N+1, ...
567   /// this is needed because each iteration in the loop corresponds to a SIMD
568   /// element.
569   virtual Value *getBroadcastInstrs(Value *V);
570 
571 protected:
572   friend class LoopVectorizationPlanner;
573 
574   /// A small list of PHINodes.
575   using PhiVector = SmallVector<PHINode *, 4>;
576 
577   /// A type for scalarized values in the new loop. Each value from the
578   /// original loop, when scalarized, is represented by UF x VF scalar values
579   /// in the new unrolled loop, where UF is the unroll factor and VF is the
580   /// vectorization factor.
581   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
582 
583   /// Set up the values of the IVs correctly when exiting the vector loop.
584   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
585                     Value *CountRoundDown, Value *EndValue,
586                     BasicBlock *MiddleBlock);
587 
588   /// Create a new induction variable inside L.
589   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
590                                    Value *Step, Instruction *DL);
591 
592   /// Handle all cross-iteration phis in the header.
593   void fixCrossIterationPHIs(VPTransformState &State);
594 
595   /// Fix a first-order recurrence. This is the second phase of vectorizing
596   /// this phi node.
597   void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);
598 
599   /// Fix a reduction cross-iteration phi. This is the second phase of
600   /// vectorizing this phi node.
601   void fixReduction(PHINode *Phi, VPTransformState &State);
602 
603   /// Clear NSW/NUW flags from reduction instructions if necessary.
604   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
605                                VPTransformState &State);
606 
  /// Fix up the LCSSA phi nodes in the unique exit block. This simply
608   /// means we need to add the appropriate incoming value from the middle
609   /// block as exiting edges from the scalar epilogue loop (if present) are
610   /// already in place, and we exit the vector loop exclusively to the middle
611   /// block.
612   void fixLCSSAPHIs(VPTransformState &State);
613 
614   /// Iteratively sink the scalarized operands of a predicated instruction into
615   /// the block that was created for it.
616   void sinkScalarOperands(Instruction *PredInst);
617 
618   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
619   /// represented as.
620   void truncateToMinimalBitwidths(VPTransformState &State);
621 
622   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
623   /// to each vector element of Val. The sequence starts at StartIndex.
624   /// \p Opcode is relevant for FP induction variable.
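  /// For example, for an integer IV with StartIdx = 0, Step = 1 and VF = 4,
  /// a broadcast of IV in Val becomes <IV, IV+1, IV+2, IV+3>.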
625   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
626                                Instruction::BinaryOps Opcode =
627                                Instruction::BinaryOpsEnd);
628 
629   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
630   /// variable on which to base the steps, \p Step is the size of the step, and
631   /// \p EntryVal is the value from the original loop that maps to the steps.
632   /// Note that \p EntryVal doesn't have to be an induction variable - it
633   /// can also be a truncate instruction.
634   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
635                         const InductionDescriptor &ID, VPValue *Def,
636                         VPValue *CastDef, VPTransformState &State);
637 
638   /// Create a vector induction phi node based on an existing scalar one. \p
639   /// EntryVal is the value from the original loop that maps to the vector phi
640   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
641   /// truncate instruction, instead of widening the original IV, we widen a
642   /// version of the IV truncated to \p EntryVal's type.
643   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
644                                        Value *Step, Value *Start,
645                                        Instruction *EntryVal, VPValue *Def,
646                                        VPValue *CastDef,
647                                        VPTransformState &State);
648 
649   /// Returns true if an instruction \p I should be scalarized instead of
650   /// vectorized for the chosen vectorization factor.
651   bool shouldScalarizeInstruction(Instruction *I) const;
652 
653   /// Returns true if we should generate a scalar version of \p IV.
654   bool needsScalarInduction(Instruction *IV) const;
655 
656   /// If there is a cast involved in the induction variable \p ID, which should
657   /// be ignored in the vectorized loop body, this function records the
658   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
659   /// cast. We had already proved that the casted Phi is equal to the uncasted
660   /// Phi in the vectorized loop (under a runtime guard), and therefore
661   /// there is no need to vectorize the cast - the same value can be used in the
662   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
665   ///
666   /// \p EntryVal is the value from the original loop that maps to the vector
667   /// phi node and is used to distinguish what is the IV currently being
668   /// processed - original one (if \p EntryVal is a phi corresponding to the
669   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
671   /// latter case \p EntryVal is a TruncInst and we must not record anything for
672   /// that IV, but it's error-prone to expect callers of this routine to care
673   /// about that, hence this explicit parameter.
674   void recordVectorLoopValueForInductionCast(
675       const InductionDescriptor &ID, const Instruction *EntryVal,
676       Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
677       unsigned Part, unsigned Lane = UINT_MAX);
678 
679   /// Generate a shuffle sequence that will reverse the vector Vec.
680   virtual Value *reverseVector(Value *Vec);
681 
682   /// Returns (and creates if needed) the original loop trip count.
683   Value *getOrCreateTripCount(Loop *NewLoop);
684 
685   /// Returns (and creates if needed) the trip count of the widened loop.
686   Value *getOrCreateVectorTripCount(Loop *NewLoop);
687 
688   /// Returns a bitcasted value to the requested vector type.
689   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
690   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
691                                 const DataLayout &DL);
692 
693   /// Emit a bypass check to see if the vector trip count is zero, including if
694   /// it overflows.
695   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
696 
697   /// Emit a bypass check to see if all of the SCEV assumptions we've
698   /// had to make are correct.
699   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
700 
701   /// Emit bypass checks to check any memory assumptions we may have made.
702   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
703 
704   /// Compute the transformed value of Index at offset StartValue using step
705   /// StepValue.
706   /// For integer induction, returns StartValue + Index * StepValue.
707   /// For pointer induction, returns StartValue[Index * StepValue].
708   /// FIXME: The newly created binary instructions should contain nsw/nuw
709   /// flags, which can be found from the original scalar operations.
710   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
711                               const DataLayout &DL,
712                               const InductionDescriptor &ID) const;
713 
714   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
715   /// vector loop preheader, middle block and scalar preheader. Also
716   /// allocate a loop object for the new vector loop and return it.
717   Loop *createVectorLoopSkeleton(StringRef Prefix);
718 
719   /// Create new phi nodes for the induction variables to resume iteration count
720   /// in the scalar epilogue, from where the vectorized loop left off (given by
721   /// \p VectorTripCount).
722   /// In cases where the loop skeleton is more complicated (eg. epilogue
723   /// vectorization) and the resume values can come from an additional bypass
724   /// block, the \p AdditionalBypass pair provides information about the bypass
725   /// block and the end value on the edge from bypass to this loop.
726   void createInductionResumeValues(
727       Loop *L, Value *VectorTripCount,
728       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
729 
730   /// Complete the loop skeleton by adding debug MDs, creating appropriate
731   /// conditional branches in the middle block, preparing the builder and
732   /// running the verifier. Take in the vector loop \p L as argument, and return
733   /// the preheader of the completed vector loop.
734   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
735 
736   /// Add additional metadata to \p To that was not present on \p Orig.
737   ///
738   /// Currently this is used to add the noalias annotations based on the
739   /// inserted memchecks.  Use this for instructions that are *cloned* into the
740   /// vector loop.
741   void addNewMetadata(Instruction *To, const Instruction *Orig);
742 
743   /// Add metadata from one instruction to another.
744   ///
745   /// This includes both the original MDs from \p From and additional ones (\see
746   /// addNewMetadata).  Use this for *newly created* instructions in the vector
747   /// loop.
748   void addMetadata(Instruction *To, Instruction *From);
749 
750   /// Similar to the previous function but it adds the metadata to a
751   /// vector of instructions.
752   void addMetadata(ArrayRef<Value *> To, Instruction *From);
753 
754   /// Allow subclasses to override and print debug traces before/after vplan
755   /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}
758 
759   /// The original loop.
760   Loop *OrigLoop;
761 
762   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
763   /// dynamic knowledge to simplify SCEV expressions and converts them to a
764   /// more usable form.
765   PredicatedScalarEvolution &PSE;
766 
767   /// Loop Info.
768   LoopInfo *LI;
769 
770   /// Dominator Tree.
771   DominatorTree *DT;
772 
773   /// Alias Analysis.
774   AAResults *AA;
775 
776   /// Target Library Info.
777   const TargetLibraryInfo *TLI;
778 
779   /// Target Transform Info.
780   const TargetTransformInfo *TTI;
781 
782   /// Assumption Cache.
783   AssumptionCache *AC;
784 
785   /// Interface to emit optimization remarks.
786   OptimizationRemarkEmitter *ORE;
787 
788   /// LoopVersioning.  It's only set up (non-null) if memchecks were
789   /// used.
790   ///
791   /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
793   std::unique_ptr<LoopVersioning> LVer;
794 
795   /// The vectorization SIMD factor to use. Each vector will have this many
796   /// vector elements.
797   ElementCount VF;
798 
799   /// The vectorization unroll factor to use. Each scalar is vectorized to this
800   /// many different vector instructions.
801   unsigned UF;
802 
803   /// The builder that we use
804   IRBuilder<> Builder;
805 
806   // --- Vectorization state ---
807 
808   /// The vector-loop preheader.
809   BasicBlock *LoopVectorPreHeader;
810 
811   /// The scalar-loop preheader.
812   BasicBlock *LoopScalarPreHeader;
813 
814   /// Middle Block between the vector and the scalar.
815   BasicBlock *LoopMiddleBlock;
816 
817   /// The (unique) ExitBlock of the scalar loop.  Note that
818   /// there can be multiple exiting edges reaching this block.
819   BasicBlock *LoopExitBlock;
820 
821   /// The vector loop body.
822   BasicBlock *LoopVectorBody;
823 
824   /// The scalar loop body.
825   BasicBlock *LoopScalarBody;
826 
827   /// A list of all bypass blocks. The first block is the entry of the loop.
828   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
829 
830   /// The new Induction variable which was added to the new block.
831   PHINode *Induction = nullptr;
832 
833   /// The induction variable of the old basic block.
834   PHINode *OldInduction = nullptr;
835 
836   /// Store instructions that were predicated.
837   SmallVector<Instruction *, 4> PredicatedInstructions;
838 
839   /// Trip count of the original loop.
840   Value *TripCount = nullptr;
841 
842   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
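  /// For example, with an original trip count of 37, VF = 4 and UF = 2, the
  /// vector loop covers 32 iterations and the remaining 5 run in the scalar
  /// epilogue.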
843   Value *VectorTripCount = nullptr;
844 
845   /// The legality analysis.
846   LoopVectorizationLegality *Legal;
847 
  /// The profitability analysis.
849   LoopVectorizationCostModel *Cost;
850 
851   // Record whether runtime checks are added.
852   bool AddedSafetyChecks = false;
853 
  // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
856   DenseMap<PHINode *, Value *> IVEndValues;
857 
858   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
859   // fixed up at the end of vector code generation.
860   SmallVector<PHINode *, 8> OrigPHIsToFix;
861 
862   /// BFI and PSI are used to check for profile guided size optimizations.
863   BlockFrequencyInfo *BFI;
864   ProfileSummaryInfo *PSI;
865 
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
868   bool OptForSizeBasedOnProfile;
869 };
870 
871 class InnerLoopUnroller : public InnerLoopVectorizer {
872 public:
873   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
874                     LoopInfo *LI, DominatorTree *DT,
875                     const TargetLibraryInfo *TLI,
876                     const TargetTransformInfo *TTI, AssumptionCache *AC,
877                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
878                     LoopVectorizationLegality *LVL,
879                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
880                     ProfileSummaryInfo *PSI)
881       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
882                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
883                             BFI, PSI) {}
884 
885 private:
886   Value *getBroadcastInstrs(Value *V) override;
887   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
888                        Instruction::BinaryOps Opcode =
889                        Instruction::BinaryOpsEnd) override;
890   Value *reverseVector(Value *Vec) override;
891 };
892 
893 /// Encapsulate information regarding vectorization of a loop and its epilogue.
894 /// This information is meant to be updated and used across two stages of
895 /// epilogue vectorization.
896 struct EpilogueLoopVectorizationInfo {
897   ElementCount MainLoopVF = ElementCount::getFixed(0);
898   unsigned MainLoopUF = 0;
899   ElementCount EpilogueVF = ElementCount::getFixed(0);
900   unsigned EpilogueUF = 0;
901   BasicBlock *MainLoopIterationCountCheck = nullptr;
902   BasicBlock *EpilogueIterationCountCheck = nullptr;
903   BasicBlock *SCEVSafetyCheck = nullptr;
904   BasicBlock *MemSafetyCheck = nullptr;
905   Value *TripCount = nullptr;
906   Value *VectorTripCount = nullptr;
907 
908   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
909                                 unsigned EUF)
910       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
911         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
912     assert(EUF == 1 &&
913            "A high UF for the epilogue loop is likely not beneficial.");
914   }
915 };
916 
917 /// An extension of the inner loop vectorizer that creates a skeleton for a
918 /// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the VPlan on a given loop twice: first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
921 /// from the first step and vectorize the epilogue.  This is achieved by
922 /// deriving two concrete strategy classes from this base class and invoking
923 /// them in succession from the loop vectorizer planner.
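/// For example (illustrative), a loop might be vectorized with a main loop at
/// VF = 8 and an epilogue loop at VF = 4; iterations that fit neither vector
/// loop are then handled by the scalar remainder loop.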
924 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
925 public:
926   InnerLoopAndEpilogueVectorizer(
927       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
928       DominatorTree *DT, const TargetLibraryInfo *TLI,
929       const TargetTransformInfo *TTI, AssumptionCache *AC,
930       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
931       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
932       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
933       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
934                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
935         EPI(EPI) {}
936 
937   // Override this function to handle the more complex control flow around the
938   // three loops.
939   BasicBlock *createVectorizedLoopSkeleton() final override {
940     return createEpilogueVectorizedLoopSkeleton();
941   }
942 
943   /// The interface for creating a vectorized skeleton using one of two
944   /// different strategies, each corresponding to one execution of the vplan
945   /// as described above.
946   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
947 
948   /// Holds and updates state information required to vectorize the main loop
949   /// and its epilogue in two separate passes. This setup helps us avoid
950   /// regenerating and recomputing runtime safety checks. It also helps us to
951   /// shorten the iteration-count-check path length for the cases where the
952   /// iteration count of the loop is so small that the main vector loop is
953   /// completely skipped.
954   EpilogueLoopVectorizationInfo &EPI;
955 };
956 
957 /// A specialized derived class of inner loop vectorizer that performs
958 /// vectorization of *main* loops in the process of vectorizing loops and their
959 /// epilogues.
960 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
961 public:
962   EpilogueVectorizerMainLoop(
963       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
964       DominatorTree *DT, const TargetLibraryInfo *TLI,
965       const TargetTransformInfo *TTI, AssumptionCache *AC,
966       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
967       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
968       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
969       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
970                                        EPI, LVL, CM, BFI, PSI) {}
971   /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of VPlan execution).
973   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
974 
975 protected:
976   /// Emits an iteration count bypass check once for the main loop (when \p
977   /// ForEpilogue is false) and once for the epilogue loop (when \p
978   /// ForEpilogue is true).
979   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
980                                              bool ForEpilogue);
981   void printDebugTracesAtStart() override;
982   void printDebugTracesAtEnd() override;
983 };
984 
985 // A specialized derived class of inner loop vectorizer that performs
986 // vectorization of *epilogue* loops in the process of vectorizing loops and
987 // their epilogues.
988 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
989 public:
990   EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
991                     LoopInfo *LI, DominatorTree *DT,
992                     const TargetLibraryInfo *TLI,
993                     const TargetTransformInfo *TTI, AssumptionCache *AC,
994                     OptimizationRemarkEmitter *ORE,
995                     EpilogueLoopVectorizationInfo &EPI,
996                     LoopVectorizationLegality *LVL,
997                     llvm::LoopVectorizationCostModel *CM,
998                     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
999       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1000                                        EPI, LVL, CM, BFI, PSI) {}
1001   /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
1003   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1004 
1005 protected:
1006   /// Emits an iteration count bypass check after the main vector loop has
1007   /// finished to see if there are any iterations left to execute by either
1008   /// the vector epilogue or the scalar epilogue.
1009   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1010                                                       BasicBlock *Bypass,
1011                                                       BasicBlock *Insert);
1012   void printDebugTracesAtStart() override;
1013   void printDebugTracesAtEnd() override;
1014 };
1015 } // end namespace llvm
1016 
/// Look for a meaningful debug location on the instruction or its
/// operands.
1019 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1020   if (!I)
1021     return I;
1022 
1023   DebugLoc Empty;
1024   if (I->getDebugLoc() != Empty)
1025     return I;
1026 
1027   for (Use &Op : I->operands()) {
1028     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1029       if (OpInst->getDebugLoc() != Empty)
1030         return OpInst;
1031   }
1032 
1033   return I;
1034 }
1035 
1036 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1037   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1038     const DILocation *DIL = Inst->getDebugLoc();
1039     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1040         !isa<DbgInfoIntrinsic>(Inst)) {
1041       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1042       auto NewDIL =
1043           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1044       if (NewDIL)
1045         B.SetCurrentDebugLocation(NewDIL.getValue());
1046       else
1047         LLVM_DEBUG(dbgs()
1048                    << "Failed to create new discriminator: "
1049                    << DIL->getFilename() << " Line: " << DIL->getLine());
1050     }
1051     else
1052       B.SetCurrentDebugLocation(DIL);
1053   } else
1054     B.SetCurrentDebugLocation(DebugLoc());
1055 }
1056 
1057 /// Write a record \p DebugMsg about vectorization failure to the debug
1058 /// output stream. If \p I is passed, it is an instruction that prevents
1059 /// vectorization.
1060 #ifndef NDEBUG
1061 static void debugVectorizationFailure(const StringRef DebugMsg,
1062     Instruction *I) {
1063   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1064   if (I != nullptr)
1065     dbgs() << " " << *I;
1066   else
1067     dbgs() << '.';
1068   dbgs() << '\n';
1069 }
1070 #endif
1071 
1072 /// Create an analysis remark that explains why vectorization failed
1073 ///
1074 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1075 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1076 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1077 /// the location of the remark.  \return the remark object that can be
1078 /// streamed to.
1079 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1080     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1081   Value *CodeRegion = TheLoop->getHeader();
1082   DebugLoc DL = TheLoop->getStartLoc();
1083 
1084   if (I) {
1085     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
1088     if (I->getDebugLoc())
1089       DL = I->getDebugLoc();
1090   }
1091 
1092   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1093   R << "loop not vectorized: ";
1094   return R;
1095 }
1096 
1097 /// Return a value for Step multiplied by VF.
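/// For example, with Step = 2 and a fixed VF of 4 this returns the constant 8,
/// while with a scalable VF of <vscale x 4> it returns vscale * 8.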
1098 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1099   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1100   Constant *StepVal = ConstantInt::get(
1101       Step->getType(),
1102       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1103   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1104 }
1105 
1106 namespace llvm {
1107 
1108 void reportVectorizationFailure(const StringRef DebugMsg,
1109     const StringRef OREMsg, const StringRef ORETag,
1110     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1111   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1112   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1113   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1114                 ORETag, TheLoop, I) << OREMsg);
1115 }
1116 
1117 } // end namespace llvm
1118 
1119 #ifndef NDEBUG
1120 /// \return string containing a file name and a line # for the given loop.
1121 static std::string getDebugLocString(const Loop *L) {
1122   std::string Result;
1123   if (L) {
1124     raw_string_ostream OS(Result);
1125     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1126       LoopDbgLoc.print(OS);
1127     else
1128       // Just print the module name.
1129       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1130     OS.flush();
1131   }
1132   return Result;
1133 }
1134 #endif
1135 
1136 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1137                                          const Instruction *Orig) {
1138   // If the loop was versioned with memchecks, add the corresponding no-alias
1139   // metadata.
1140   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1141     LVer->annotateInstWithNoAlias(To, Orig);
1142 }
1143 
1144 void InnerLoopVectorizer::addMetadata(Instruction *To,
1145                                       Instruction *From) {
1146   propagateMetadata(To, From);
1147   addNewMetadata(To, From);
1148 }
1149 
1150 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1151                                       Instruction *From) {
1152   for (Value *V : To) {
1153     if (Instruction *I = dyn_cast<Instruction>(V))
1154       addMetadata(I, From);
1155   }
1156 }
1157 
1158 namespace llvm {
1159 
// A hint for the loop vectorization cost model describing how the scalar
// epilogue loop should be lowered.
1162 enum ScalarEpilogueLowering {
1163 
1164   // The default: allowing scalar epilogues.
1165   CM_ScalarEpilogueAllowed,
1166 
1167   // Vectorization with OptForSize: don't allow epilogues.
1168   CM_ScalarEpilogueNotAllowedOptSize,
1169 
  // A special case of vectorization with OptForSize: loops with a very small
1171   // trip count are considered for vectorization under OptForSize, thereby
1172   // making sure the cost of their loop body is dominant, free of runtime
1173   // guards and scalar iteration overheads.
1174   CM_ScalarEpilogueNotAllowedLowTripLoop,
1175 
1176   // Loop hint predicate indicating an epilogue is undesired.
1177   CM_ScalarEpilogueNotNeededUsePredicate,
1178 
1179   // Directive indicating we must either tail fold or not vectorize
1180   CM_ScalarEpilogueNotAllowedUsePredicate
1181 };
1182 
1183 /// LoopVectorizationCostModel - estimates the expected speedups due to
1184 /// vectorization.
1185 /// In many cases vectorization is not profitable. This can happen because of
1186 /// a number of reasons. In this class we mainly attempt to predict the
1187 /// expected speedup/slowdowns due to the supported instruction set. We use the
1188 /// TargetTransformInfo to query the different backends for the cost of
1189 /// different operations.
1190 class LoopVectorizationCostModel {
1191 public:
1192   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1193                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1194                              LoopVectorizationLegality *Legal,
1195                              const TargetTransformInfo &TTI,
1196                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1197                              AssumptionCache *AC,
1198                              OptimizationRemarkEmitter *ORE, const Function *F,
1199                              const LoopVectorizeHints *Hints,
1200                              InterleavedAccessInfo &IAI)
1201       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1202         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1203         Hints(Hints), InterleaveInfo(IAI) {}
1204 
1205   /// \return An upper bound for the vectorization factor, or None if
1206   /// vectorization and interleaving should be avoided up front.
1207   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1208 
1209   /// \return True if runtime checks are required for vectorization, and false
1210   /// otherwise.
1211   bool runtimeChecksRequired();
1212 
1213   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
1217   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1218   VectorizationFactor
1219   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1220                                     const LoopVectorizationPlanner &LVP);
1221 
1222   /// Setup cost-based decisions for user vectorization factor.
1223   void selectUserVectorizationFactor(ElementCount UserVF) {
1224     collectUniformsAndScalars(UserVF);
1225     collectInstsToScalarize(UserVF);
1226   }
1227 
1228   /// \return The size (in bits) of the smallest and widest types in the code
1229   /// that needs to be vectorized. We ignore values that remain scalar such as
1230   /// 64 bit loop indices.
1231   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1232 
1233   /// \return The desired interleave count.
1234   /// If interleave count has been specified by metadata it will be returned.
1235   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1236   /// are the selected vectorization factor and the cost of the selected VF.
1237   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1238 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
1244   /// The calculated cost is saved with widening decision in order to
1245   /// avoid redundant calculations.
1246   void setCostBasedWideningDecision(ElementCount VF);
1247 
1248   /// A struct that represents some properties of the register usage
1249   /// of a loop.
1250   struct RegisterUsage {
1251     /// Holds the number of loop invariant values that are used in the loop.
1252     /// The key is ClassID of target-provided register class.
1253     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1254     /// Holds the maximum number of concurrent live intervals in the loop.
1255     /// The key is ClassID of target-provided register class.
1256     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1257   };
1258 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1261   SmallVector<RegisterUsage, 8>
1262   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1263 
1264   /// Collect values we want to ignore in the cost model.
1265   void collectValuesToIgnore();
1266 
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1269   void collectInLoopReductions();
1270 
1271   /// \returns The smallest bitwidth each instruction can be represented with.
1272   /// The vector equivalents of these instructions should be truncated to this
1273   /// type.
1274   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1275     return MinBWs;
1276   }
1277 
1278   /// \returns True if it is more profitable to scalarize instruction \p I for
1279   /// vectorization factor \p VF.
1280   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1281     assert(VF.isVector() &&
1282            "Profitable to scalarize relevant only for VF > 1.");
1283 
1284     // Cost model is not run in the VPlan-native path - return conservative
1285     // result until this changes.
1286     if (EnableVPlanNativePath)
1287       return false;
1288 
1289     auto Scalars = InstsToScalarize.find(VF);
1290     assert(Scalars != InstsToScalarize.end() &&
1291            "VF not yet analyzed for scalarization profitability");
1292     return Scalars->second.find(I) != Scalars->second.end();
1293   }
1294 
1295   /// Returns true if \p I is known to be uniform after vectorization.
1296   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1297     if (VF.isScalar())
1298       return true;
1299 
1300     // Cost model is not run in the VPlan-native path - return conservative
1301     // result until this changes.
1302     if (EnableVPlanNativePath)
1303       return false;
1304 
1305     auto UniformsPerVF = Uniforms.find(VF);
1306     assert(UniformsPerVF != Uniforms.end() &&
1307            "VF not yet analyzed for uniformity");
1308     return UniformsPerVF->second.count(I);
1309   }
1310 
1311   /// Returns true if \p I is known to be scalar after vectorization.
1312   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1313     if (VF.isScalar())
1314       return true;
1315 
1316     // Cost model is not run in the VPlan-native path - return conservative
1317     // result until this changes.
1318     if (EnableVPlanNativePath)
1319       return false;
1320 
1321     auto ScalarsPerVF = Scalars.find(VF);
1322     assert(ScalarsPerVF != Scalars.end() &&
1323            "Scalar values are not calculated for VF");
1324     return ScalarsPerVF->second.count(I);
1325   }
1326 
1327   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1328   /// for vectorization factor \p VF.
1329   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1330     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1331            !isProfitableToScalarize(I, VF) &&
1332            !isScalarAfterVectorization(I, VF);
1333   }
1334 
1335   /// Decision that was taken during cost calculation for memory instruction.
1336   enum InstWidening {
1337     CM_Unknown,
1338     CM_Widen,         // For consecutive accesses with stride +1.
1339     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1340     CM_Interleave,
1341     CM_GatherScatter,
1342     CM_Scalarize
1343   };
1344 
1345   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1346   /// instruction \p I and vector width \p VF.
1347   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1348                            InstructionCost Cost) {
1349     assert(VF.isVector() && "Expected VF >=2");
1350     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1351   }
1352 
1353   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1354   /// interleaving group \p Grp and vector width \p VF.
1355   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1356                            ElementCount VF, InstWidening W,
1357                            InstructionCost Cost) {
1358     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1361     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1362       if (auto *I = Grp->getMember(i)) {
1363         if (Grp->getInsertPos() == I)
1364           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1365         else
1366           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1367       }
1368     }
1369   }
1370 
1371   /// Return the cost model decision for the given instruction \p I and vector
1372   /// width \p VF. Return CM_Unknown if this instruction did not pass
1373   /// through the cost modeling.
1374   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1375     assert(VF.isVector() && "Expected VF to be a vector VF");
1376     // Cost model is not run in the VPlan-native path - return conservative
1377     // result until this changes.
1378     if (EnableVPlanNativePath)
1379       return CM_GatherScatter;
1380 
1381     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1382     auto Itr = WideningDecisions.find(InstOnVF);
1383     if (Itr == WideningDecisions.end())
1384       return CM_Unknown;
1385     return Itr->second.first;
1386   }
1387 
1388   /// Return the vectorization cost for the given instruction \p I and vector
1389   /// width \p VF.
1390   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1391     assert(VF.isVector() && "Expected VF >=2");
1392     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1393     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1394            "The cost is not calculated");
1395     return WideningDecisions[InstOnVF].second;
1396   }
1397 
1398   /// Return True if instruction \p I is an optimizable truncate whose operand
1399   /// is an induction variable. Such a truncate will be removed by adding a new
1400   /// induction variable with the destination type.
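  ///
  /// Illustrative sketch (the names below are assumed for the example, not
  /// taken from any particular input): a loop with a 64-bit primary IV whose
  /// only narrow use is
  ///   %iv.trunc = trunc i64 %iv to i32
  /// can drop the truncate by introducing a new i32 induction variable
  ///   %iv32 = phi i32 [ 0, %preheader ], [ %iv32.next, %loop ]
  ///   %iv32.next = add i32 %iv32, 1
  /// provided the truncated value is an induction phi (checked via Legal
  /// below).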
1401   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1402     // If the instruction is not a truncate, return false.
1403     auto *Trunc = dyn_cast<TruncInst>(I);
1404     if (!Trunc)
1405       return false;
1406 
1407     // Get the source and destination types of the truncate.
1408     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1409     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1410 
1411     // If the truncate is free for the given types, return false. Replacing a
1412     // free truncate with an induction variable would add an induction variable
1413     // update instruction to each iteration of the loop. We exclude from this
1414     // check the primary induction variable since it will need an update
1415     // instruction regardless.
1416     Value *Op = Trunc->getOperand(0);
1417     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1418       return false;
1419 
1420     // If the truncated value is not an induction variable, return false.
1421     return Legal->isInductionPhi(Op);
1422   }
1423 
1424   /// Collects the instructions to scalarize for each predicated instruction in
1425   /// the loop.
1426   void collectInstsToScalarize(ElementCount VF);
1427 
1428   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decision for Load/Store instructions that may
  /// be vectorized as an interleaved access, a gather/scatter, or scalarized.
1431   void collectUniformsAndScalars(ElementCount VF) {
1432     // Do the analysis once.
1433     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1434       return;
1435     setCostBasedWideningDecision(VF);
1436     collectLoopUniforms(VF);
1437     collectLoopScalars(VF);
1438   }
1439 
1440   /// Returns true if the target machine supports masked store operation
1441   /// for the given \p DataType and kind of access to \p Ptr.
1442   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1443     return Legal->isConsecutivePtr(Ptr) &&
1444            TTI.isLegalMaskedStore(DataType, Alignment);
1445   }
1446 
1447   /// Returns true if the target machine supports masked load operation
1448   /// for the given \p DataType and kind of access to \p Ptr.
1449   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1450     return Legal->isConsecutivePtr(Ptr) &&
1451            TTI.isLegalMaskedLoad(DataType, Alignment);
1452   }
1453 
1454   /// Returns true if the target machine supports masked scatter operation
1455   /// for the given \p DataType.
1456   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1457     return TTI.isLegalMaskedScatter(DataType, Alignment);
1458   }
1459 
1460   /// Returns true if the target machine supports masked gather operation
1461   /// for the given \p DataType.
1462   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1463     return TTI.isLegalMaskedGather(DataType, Alignment);
1464   }
1465 
1466   /// Returns true if the target machine can represent \p V as a masked gather
1467   /// or scatter operation.
1468   bool isLegalGatherOrScatter(Value *V) {
1469     bool LI = isa<LoadInst>(V);
1470     bool SI = isa<StoreInst>(V);
1471     if (!LI && !SI)
1472       return false;
1473     auto *Ty = getMemInstValueType(V);
1474     Align Align = getLoadStoreAlignment(V);
1475     return (LI && isLegalMaskedGather(Ty, Align)) ||
1476            (SI && isLegalMaskedScatter(Ty, Align));
1477   }
1478 
1479   /// Returns true if the target machine supports all of the reduction
1480   /// variables found for the given VF.
1481   bool canVectorizeReductions(ElementCount VF) {
1482     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1483       RecurrenceDescriptor RdxDesc = Reduction.second;
1484       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1485     }));
1486   }
1487 
1488   /// Returns true if \p I is an instruction that will be scalarized with
1489   /// predication. Such instructions include conditional stores and
1490   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1493   bool isScalarWithPredication(Instruction *I,
1494                                ElementCount VF = ElementCount::getFixed(1));
1495 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1499   bool isPredicatedInst(Instruction *I) {
1500     if (!blockNeedsPredication(I->getParent()))
1501       return false;
1502     // Loads and stores that need some form of masked operation are predicated
1503     // instructions.
1504     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1505       return Legal->isMaskRequired(I);
1506     return isScalarWithPredication(I);
1507   }
1508 
1509   /// Returns true if \p I is a memory instruction with consecutive memory
1510   /// access that can be widened.
1511   bool
1512   memoryInstructionCanBeWidened(Instruction *I,
1513                                 ElementCount VF = ElementCount::getFixed(1));
1514 
1515   /// Returns true if \p I is a memory instruction in an interleaved-group
1516   /// of memory accesses that can be vectorized with wide vector loads/stores
1517   /// and shuffles.
1518   bool
1519   interleavedAccessCanBeWidened(Instruction *I,
1520                                 ElementCount VF = ElementCount::getFixed(1));
1521 
1522   /// Check if \p Instr belongs to any interleaved access group.
1523   bool isAccessInterleaved(Instruction *Instr) {
1524     return InterleaveInfo.isInterleaved(Instr);
1525   }
1526 
1527   /// Get the interleaved access group that \p Instr belongs to.
1528   const InterleaveGroup<Instruction> *
1529   getInterleavedAccessGroup(Instruction *Instr) {
1530     return InterleaveInfo.getInterleaveGroup(Instr);
1531   }
1532 
1533   /// Returns true if we're required to use a scalar epilogue for at least
1534   /// the final iteration of the original loop.
1535   bool requiresScalarEpilogue() const {
1536     if (!isScalarEpilogueAllowed())
1537       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1540     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1541       return true;
1542     return InterleaveInfo.requiresScalarEpilogue();
1543   }
1544 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1547   bool isScalarEpilogueAllowed() const {
1548     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1549   }
1550 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1552   bool foldTailByMasking() const { return FoldTailByMasking; }
1553 
1554   bool blockNeedsPredication(BasicBlock *BB) {
1555     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1556   }
1557 
1558   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1559   /// nodes to the chain of instructions representing the reductions. Uses a
1560   /// MapVector to ensure deterministic iteration order.
1561   using ReductionChainMap =
1562       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1563 
1564   /// Return the chain of instructions representing an inloop reduction.
1565   const ReductionChainMap &getInLoopReductionChains() const {
1566     return InLoopReductionChains;
1567   }
1568 
1569   /// Returns true if the Phi is part of an inloop reduction.
1570   bool isInLoopReduction(PHINode *Phi) const {
1571     return InLoopReductionChains.count(Phi);
1572   }
1573 
1574   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1575   /// with factor VF.  Return the cost of the instruction, including
1576   /// scalarization overhead if it's needed.
1577   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1578 
1579   /// Estimate cost of a call instruction CI if it were vectorized with factor
1580   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or the vector
  /// version is too expensive.
1584   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1585                                     bool &NeedToScalarize);
1586 
1587   /// Invalidates decisions already taken by the cost model.
1588   void invalidateCostModelingDecisions() {
1589     WideningDecisions.clear();
1590     Uniforms.clear();
1591     Scalars.clear();
1592   }
1593 
1594 private:
1595   unsigned NumPredStores = 0;
1596 
1597   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1598   /// than zero. One is returned if vectorization should best be avoided due
1599   /// to cost.
1600   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1601                                     ElementCount UserVF);
1602 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1610   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1611 
1612   /// Returns the expected execution cost. The unit of the cost does
1613   /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
1616   VectorizationCostTy expectedCost(ElementCount VF);
1617 
1618   /// Returns the execution time cost of an instruction for a given vector
1619   /// width. Vector width of one means scalar.
1620   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1621 
1622   /// The cost-computation logic from getInstructionCost which provides
1623   /// the vector type as an output parameter.
1624   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1625                                      Type *&VectorTy);
1626 
1627   /// Return the cost of instructions in an inloop reduction pattern, if I is
1628   /// part of that pattern.
1629   InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1630                                           Type *VectorTy,
1631                                           TTI::TargetCostKind CostKind);
1632 
1633   /// Calculate vectorization cost of memory instruction \p I.
1634   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1635 
1636   /// The cost computation for scalarized memory instruction.
1637   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1638 
1639   /// The cost computation for interleaving group of memory instructions.
1640   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1641 
1642   /// The cost computation for Gather/Scatter instruction.
1643   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1644 
1645   /// The cost computation for widening instruction \p I with consecutive
1646   /// memory access.
1647   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1648 
  /// The cost calculation for a Load/Store instruction \p I with a uniform pointer:
1650   /// Load: scalar load + broadcast.
1651   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1652   /// element)
1653   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1654 
1655   /// Estimate the overhead of scalarizing an instruction. This is a
1656   /// convenience wrapper for the type-based getScalarizationOverhead API.
1657   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
1658 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1661   bool isConsecutiveLoadOrStore(Instruction *I);
1662 
1663   /// Returns true if an artificially high cost for emulated masked memrefs
1664   /// should be used.
1665   bool useEmulatedMaskMemRefHack(Instruction *I);
1666 
1667   /// Map of scalar integer values to the smallest bitwidth they can be legally
1668   /// represented as. The vector equivalents of these values should be truncated
1669   /// to this type.
1670   MapVector<Instruction *, uint64_t> MinBWs;
1671 
1672   /// A type representing the costs for instructions if they were to be
1673   /// scalarized rather than vectorized. The entries are Instruction-Cost
1674   /// pairs.
1675   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1676 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1679   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1680 
1681   /// Records whether it is allowed to have the original scalar loop execute at
1682   /// least once. This may be needed as a fallback loop in case runtime
1683   /// aliasing/dependence checks fail, or to handle the tail/remainder
1684   /// iterations when the trip count is unknown or doesn't divide by the VF,
1685   /// or as a peel-loop to handle gaps in interleave-groups.
1686   /// Under optsize and when the trip count is very small we don't allow any
1687   /// iterations to execute in the scalar loop.
1688   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1689 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1691   bool FoldTailByMasking = false;
1692 
1693   /// A map holding scalar costs for different vectorization factors. The
1694   /// presence of a cost for an instruction in the mapping indicates that the
1695   /// instruction will be scalarized when vectorizing with the associated
1696   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1697   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1698 
1699   /// Holds the instructions known to be uniform after vectorization.
1700   /// The data is collected per VF.
1701   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1702 
1703   /// Holds the instructions known to be scalar after vectorization.
1704   /// The data is collected per VF.
1705   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1706 
1707   /// Holds the instructions (address computations) that are forced to be
1708   /// scalarized.
1709   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1710 
  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1714   ReductionChainMap InLoopReductionChains;
1715 
  /// A map of in-loop reduction operations and their immediate chain operand.
1717   /// FIXME: This can be removed once reductions can be costed correctly in
1718   /// vplan. This was added to allow quick lookup to the inloop operations,
1719   /// without having to loop through InLoopReductionChains.
1720   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1721 
1722   /// Returns the expected difference in cost from scalarizing the expression
1723   /// feeding a predicated instruction \p PredInst. The instructions to
1724   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1725   /// non-negative return value implies the expression will be scalarized.
1726   /// Currently, only single-use chains are considered for scalarization.
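  ///
  /// For example (an illustrative sketch, not tied to a specific input): given
  /// a predicated block containing
  ///   %a = add i32 %x, %y
  ///   %d = udiv i32 %a, %z   ; must be predicated, may divide by zero
  /// the discount weighs scalarizing the single-use %a together with %d
  /// (avoiding lane extracts) against keeping %a vectorized and extracting
  /// its lanes for the scalarized %d.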
1727   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1728                               ElementCount VF);
1729 
1730   /// Collect the instructions that are uniform after vectorization. An
1731   /// instruction is uniform if we represent it with a single scalar value in
1732   /// the vectorized loop corresponding to each vector iteration. Examples of
1733   /// uniform instructions include pointer operands of consecutive or
1734   /// interleaved memory accesses. Note that although uniformity implies an
1735   /// instruction will be scalar, the reverse is not true. In general, a
1736   /// scalarized instruction will be represented by VF scalar values in the
1737   /// vectorized loop, each corresponding to an iteration of the original
1738   /// scalar loop.
1739   void collectLoopUniforms(ElementCount VF);
1740 
1741   /// Collect the instructions that are scalar after vectorization. An
1742   /// instruction is scalar if it is known to be uniform or will be scalarized
1743   /// during vectorization. Non-uniform scalarized instructions will be
1744   /// represented by VF values in the vectorized loop, each corresponding to an
1745   /// iteration of the original scalar loop.
1746   void collectLoopScalars(ElementCount VF);
1747 
  /// Keeps the cost-model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
1750   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1751                                 std::pair<InstWidening, InstructionCost>>;
1752 
1753   DecisionList WideningDecisions;
1754 
1755   /// Returns true if \p V is expected to be vectorized and it needs to be
1756   /// extracted.
1757   bool needsExtract(Value *V, ElementCount VF) const {
1758     Instruction *I = dyn_cast<Instruction>(V);
1759     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1760         TheLoop->isLoopInvariant(I))
1761       return false;
1762 
1763     // Assume we can vectorize V (and hence we need extraction) if the
1764     // scalars are not computed yet. This can happen, because it is called
1765     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1766     // the scalars are collected. That should be a safe assumption in most
1767     // cases, because we check if the operands have vectorizable types
1768     // beforehand in LoopVectorizationLegality.
1769     return Scalars.find(VF) == Scalars.end() ||
1770            !isScalarAfterVectorization(I, VF);
1771   };
1772 
1773   /// Returns a range containing only operands needing to be extracted.
1774   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1775                                                    ElementCount VF) {
1776     return SmallVector<Value *, 4>(make_filter_range(
1777         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1778   }
1779 
1780   /// Determines if we have the infrastructure to vectorize loop \p L and its
1781   /// epilogue, assuming the main loop is vectorized by \p VF.
1782   bool isCandidateForEpilogueVectorization(const Loop &L,
1783                                            const ElementCount VF) const;
1784 
1785   /// Returns true if epilogue vectorization is considered profitable, and
1786   /// false otherwise.
1787   /// \p VF is the vectorization factor chosen for the original loop.
1788   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1789 
1790 public:
1791   /// The loop that we evaluate.
1792   Loop *TheLoop;
1793 
1794   /// Predicated scalar evolution analysis.
1795   PredicatedScalarEvolution &PSE;
1796 
1797   /// Loop Info analysis.
1798   LoopInfo *LI;
1799 
1800   /// Vectorization legality.
1801   LoopVectorizationLegality *Legal;
1802 
1803   /// Vector target information.
1804   const TargetTransformInfo &TTI;
1805 
1806   /// Target Library Info.
1807   const TargetLibraryInfo *TLI;
1808 
1809   /// Demanded bits analysis.
1810   DemandedBits *DB;
1811 
1812   /// Assumption cache.
1813   AssumptionCache *AC;
1814 
1815   /// Interface to emit optimization remarks.
1816   OptimizationRemarkEmitter *ORE;
1817 
1818   const Function *TheFunction;
1819 
1820   /// Loop Vectorize Hint.
1821   const LoopVectorizeHints *Hints;
1822 
  /// The interleaved access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1825   InterleavedAccessInfo &InterleaveInfo;
1826 
1827   /// Values to ignore in the cost model.
1828   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1829 
1830   /// Values to ignore in the cost model when VF > 1.
1831   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1832 
  /// Profitable vectorization factors.
1834   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1835 };
1836 
1837 } // end namespace llvm
1838 
1839 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1842 // vector length information is not provided, vectorization is not considered
1843 // explicit. Interleave hints are not allowed either. These limitations will be
1844 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1846 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1847 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1848 // provides *explicit vectorization hints* (LV can bypass legal checks and
1849 // assume that vectorization is legal). However, both hints are implemented
1850 // using the same metadata (llvm.loop.vectorize, processed by
1851 // LoopVectorizeHints). This will be fixed in the future when the native IR
1852 // representation for pragma 'omp simd' is introduced.
1853 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1854                                    OptimizationRemarkEmitter *ORE) {
1855   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1856   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1857 
1858   // Only outer loops with an explicit vectorization hint are supported.
1859   // Unannotated outer loops are ignored.
1860   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1861     return false;
1862 
1863   Function *Fn = OuterLp->getHeader()->getParent();
1864   if (!Hints.allowVectorization(Fn, OuterLp,
1865                                 true /*VectorizeOnlyWhenForced*/)) {
1866     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1867     return false;
1868   }
1869 
1870   if (Hints.getInterleave() > 1) {
1871     // TODO: Interleave support is future work.
1872     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1873                          "outer loops.\n");
1874     Hints.emitRemarkWithHints();
1875     return false;
1876   }
1877 
1878   return true;
1879 }
1880 
1881 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1882                                   OptimizationRemarkEmitter *ORE,
1883                                   SmallVectorImpl<Loop *> &V) {
1884   // Collect inner loops and outer loops without irreducible control flow. For
1885   // now, only collect outer loops that have explicit vectorization hints. If we
1886   // are stress testing the VPlan H-CFG construction, we collect the outermost
1887   // loop of every loop nest.
1888   if (L.isInnermost() || VPlanBuildStressTest ||
1889       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1890     LoopBlocksRPO RPOT(&L);
1891     RPOT.perform(LI);
1892     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1893       V.push_back(&L);
1894       // TODO: Collect inner loops inside marked outer loops in case
1895       // vectorization fails for the outer loop. Do not invoke
1896       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1897       // already known to be reducible. We can use an inherited attribute for
1898       // that.
1899       return;
1900     }
1901   }
1902   for (Loop *InnerL : L)
1903     collectSupportedLoops(*InnerL, LI, ORE, V);
1904 }
1905 
1906 namespace {
1907 
1908 /// The LoopVectorize Pass.
1909 struct LoopVectorize : public FunctionPass {
1910   /// Pass identification, replacement for typeid
1911   static char ID;
1912 
1913   LoopVectorizePass Impl;
1914 
1915   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1916                          bool VectorizeOnlyWhenForced = false)
1917       : FunctionPass(ID),
1918         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1919     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1920   }
1921 
1922   bool runOnFunction(Function &F) override {
1923     if (skipFunction(F))
1924       return false;
1925 
1926     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1927     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1928     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1929     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1930     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1931     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1932     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1933     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1934     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1935     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1936     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1937     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1938     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1939 
1940     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1941         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1942 
1943     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1944                         GetLAA, *ORE, PSI).MadeAnyChange;
1945   }
1946 
1947   void getAnalysisUsage(AnalysisUsage &AU) const override {
1948     AU.addRequired<AssumptionCacheTracker>();
1949     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1950     AU.addRequired<DominatorTreeWrapperPass>();
1951     AU.addRequired<LoopInfoWrapperPass>();
1952     AU.addRequired<ScalarEvolutionWrapperPass>();
1953     AU.addRequired<TargetTransformInfoWrapperPass>();
1954     AU.addRequired<AAResultsWrapperPass>();
1955     AU.addRequired<LoopAccessLegacyAnalysis>();
1956     AU.addRequired<DemandedBitsWrapperPass>();
1957     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1958     AU.addRequired<InjectTLIMappingsLegacy>();
1959 
    // We currently do not preserve LoopInfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
1963     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1964     if (!EnableVPlanNativePath) {
1965       AU.addPreserved<LoopInfoWrapperPass>();
1966       AU.addPreserved<DominatorTreeWrapperPass>();
1967     }
1968 
1969     AU.addPreserved<BasicAAWrapperPass>();
1970     AU.addPreserved<GlobalsAAWrapperPass>();
1971     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1972   }
1973 };
1974 
1975 } // end anonymous namespace
1976 
1977 //===----------------------------------------------------------------------===//
1978 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1979 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1980 //===----------------------------------------------------------------------===//
1981 
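// A broadcast of a scalar %x at VF = 4 is emitted by IRBuilder::CreateVectorSplat
// roughly as follows (illustrative; the unused base vector may be undef or
// poison depending on the IRBuilder version):
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer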
1982 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1986   Instruction *Instr = dyn_cast<Instruction>(V);
1987   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1988                      (!Instr ||
1989                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1990   // Place the code for broadcasting invariant variables in the new preheader.
1991   IRBuilder<>::InsertPointGuard Guard(Builder);
1992   if (SafeToHoist)
1993     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1994 
1995   // Broadcast the scalar into all locations in the vector.
1996   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1997 
1998   return Shuf;
1999 }
2000 
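// For example (illustrative), widening an i32 induction with start 0 and step 1
// at VF = 4, UF = 1 produces a vector IV of the form:
//   vector.body:
//     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>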
2001 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2002     const InductionDescriptor &II, Value *Step, Value *Start,
2003     Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2004     VPTransformState &State) {
2005   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2006          "Expected either an induction phi-node or a truncate of it!");
2007 
  // Construct the initial value of the vector IV in the vector loop preheader.
2009   auto CurrIP = Builder.saveIP();
2010   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2011   if (isa<TruncInst>(EntryVal)) {
2012     assert(Start->getType()->isIntegerTy() &&
2013            "Truncation requires an integer type");
2014     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2015     Step = Builder.CreateTrunc(Step, TruncType);
2016     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2017   }
2018   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2019   Value *SteppedStart =
2020       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2021 
2022   // We create vector phi nodes for both integer and floating-point induction
2023   // variables. Here, we determine the kind of arithmetic we will perform.
2024   Instruction::BinaryOps AddOp;
2025   Instruction::BinaryOps MulOp;
2026   if (Step->getType()->isIntegerTy()) {
2027     AddOp = Instruction::Add;
2028     MulOp = Instruction::Mul;
2029   } else {
2030     AddOp = II.getInductionOpcode();
2031     MulOp = Instruction::FMul;
2032   }
2033 
2034   // Multiply the vectorization factor by the step using integer or
2035   // floating-point arithmetic as appropriate.
2036   Value *ConstVF =
2037       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2038   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2039 
2040   // Create a vector splat to use in the induction update.
2041   //
2042   // FIXME: If the step is non-constant, we create the vector splat with
2043   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2044   //        handle a constant vector splat.
2045   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2046   Value *SplatVF = isa<Constant>(Mul)
2047                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2048                        : Builder.CreateVectorSplat(VF, Mul);
2049   Builder.restoreIP(CurrIP);
2050 
2051   // We may need to add the step a number of times, depending on the unroll
2052   // factor. The last of those goes into the PHI.
2053   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2054                                     &*LoopVectorBody->getFirstInsertionPt());
2055   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2056   Instruction *LastInduction = VecInd;
2057   for (unsigned Part = 0; Part < UF; ++Part) {
2058     State.set(Def, LastInduction, Part);
2059 
2060     if (isa<TruncInst>(EntryVal))
2061       addMetadata(LastInduction, EntryVal);
2062     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2063                                           State, Part);
2064 
2065     LastInduction = cast<Instruction>(addFastMathFlag(
2066         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2067     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2068   }
2069 
2070   // Move the last step to the end of the latch block. This ensures consistent
2071   // placement of all induction updates.
2072   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2073   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2074   auto *ICmp = cast<Instruction>(Br->getCondition());
2075   LastInduction->moveBefore(ICmp);
2076   LastInduction->setName("vec.ind.next");
2077 
2078   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2079   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2080 }
2081 
2082 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2083   return Cost->isScalarAfterVectorization(I, VF) ||
2084          Cost->isProfitableToScalarize(I, VF);
2085 }
2086 
2087 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2088   if (shouldScalarizeInstruction(IV))
2089     return true;
2090   auto isScalarInst = [&](User *U) -> bool {
2091     auto *I = cast<Instruction>(U);
2092     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2093   };
2094   return llvm::any_of(IV->users(), isScalarInst);
2095 }
2096 
2097 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2098     const InductionDescriptor &ID, const Instruction *EntryVal,
2099     Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2100     unsigned Part, unsigned Lane) {
2101   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2102          "Expected either an induction phi-node or a truncate of it!");
2103 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2110   if (isa<TruncInst>(EntryVal))
2111     return;
2112 
2113   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2114   if (Casts.empty())
2115     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any) have no uses outside the
  // induction update chain itself.
2119   if (Lane < UINT_MAX)
2120     State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2121   else
2122     State.set(CastDef, VectorLoopVal, Part);
2123 }
2124 
2125 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2126                                                 TruncInst *Trunc, VPValue *Def,
2127                                                 VPValue *CastDef,
2128                                                 VPTransformState &State) {
2129   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2130          "Primary induction variable must have an integer type");
2131 
2132   auto II = Legal->getInductionVars().find(IV);
2133   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2134 
2135   auto ID = II->second;
2136   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2137 
2138   // The value from the original loop to which we are mapping the new induction
2139   // variable.
2140   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2141 
2142   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2143 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2146   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2147     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2148            "Induction step should be loop invariant");
2149     if (PSE.getSE()->isSCEVable(IV->getType())) {
2150       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2151       return Exp.expandCodeFor(Step, Step->getType(),
2152                                LoopVectorPreHeader->getTerminator());
2153     }
2154     return cast<SCEVUnknown>(Step)->getValue();
2155   };
2156 
2157   // The scalar value to broadcast. This is derived from the canonical
2158   // induction variable. If a truncation type is given, truncate the canonical
2159   // induction variable and step. Otherwise, derive these values from the
2160   // induction descriptor.
2161   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2162     Value *ScalarIV = Induction;
2163     if (IV != OldInduction) {
2164       ScalarIV = IV->getType()->isIntegerTy()
2165                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2166                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2167                                           IV->getType());
2168       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2169       ScalarIV->setName("offset.idx");
2170     }
2171     if (Trunc) {
2172       auto *TruncType = cast<IntegerType>(Trunc->getType());
2173       assert(Step->getType()->isIntegerTy() &&
2174              "Truncation requires an integer step");
2175       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2176       Step = Builder.CreateTrunc(Step, TruncType);
2177     }
2178     return ScalarIV;
2179   };
2180 
2181   // Create the vector values from the scalar IV, in the absence of creating a
2182   // vector IV.
2183   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2184     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2185     for (unsigned Part = 0; Part < UF; ++Part) {
2186       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2187       Value *EntryPart =
2188           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2189                         ID.getInductionOpcode());
2190       State.set(Def, EntryPart, Part);
2191       if (Trunc)
2192         addMetadata(EntryPart, Trunc);
2193       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2194                                             State, Part);
2195     }
2196   };
2197 
2198   // Now do the actual transformations, and start with creating the step value.
2199   Value *Step = CreateStepValue(ID.getStep());
2200   if (VF.isZero() || VF.isScalar()) {
2201     Value *ScalarIV = CreateScalarIV(Step);
2202     CreateSplatIV(ScalarIV, Step);
2203     return;
2204   }
2205 
2206   // Determine if we want a scalar version of the induction variable. This is
2207   // true if the induction variable itself is not widened, or if it has at
2208   // least one user in the loop that is not widened.
2209   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2210   if (!NeedsScalarIV) {
2211     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2212                                     State);
2213     return;
2214   }
2215 
2216   // Try to create a new independent vector induction variable. If we can't
2217   // create the phi node, we will splat the scalar induction variable in each
2218   // loop iteration.
2219   if (!shouldScalarizeInstruction(EntryVal)) {
2220     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2221                                     State);
2222     Value *ScalarIV = CreateScalarIV(Step);
2223     // Create scalar steps that can be used by instructions we will later
2224     // scalarize. Note that the addition of the scalar steps will not increase
2225     // the number of instructions in the loop in the common case prior to
2226     // InstCombine. We will be trading one vector extract for each scalar step.
2227     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2228     return;
2229   }
2230 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV. Except when we tail-fold: then the splat IV feeds the
  // predicate used by the masked loads/stores.
2234   Value *ScalarIV = CreateScalarIV(Step);
2235   if (!Cost->isScalarEpilogueAllowed())
2236     CreateSplatIV(ScalarIV, Step);
2237   buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2238 }
2239 
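// For example (illustrative, before constant folding), getStepVector with
// StartIdx = 4, Step = 2 and VF = 4 splats the step and emits:
//   %mul = mul <4 x i32> <i32 4, i32 5, i32 6, i32 7>, %step.splat
//   %induction = add <4 x i32> %val, %mul
// i.e. %val + <8, 10, 12, 14>.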
2240 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2241                                           Instruction::BinaryOps BinOp) {
2242   // Create and check the types.
2243   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2244   int VLen = ValVTy->getNumElements();
2245 
2246   Type *STy = Val->getType()->getScalarType();
2247   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2248          "Induction Step must be an integer or FP");
2249   assert(Step->getType() == STy && "Step has wrong type");
2250 
2251   SmallVector<Constant *, 8> Indices;
2252 
2253   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers <StartIdx, StartIdx + 1, ...>.
2255     for (int i = 0; i < VLen; ++i)
2256       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2257 
2258     // Add the consecutive indices to the vector value.
2259     Constant *Cv = ConstantVector::get(Indices);
2260     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2261     Step = Builder.CreateVectorSplat(VLen, Step);
2262     assert(Step->getType() == Val->getType() && "Invalid step vec");
2263     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2264     // which can be found from the original scalar operations.
2265     Step = Builder.CreateMul(Cv, Step);
2266     return Builder.CreateAdd(Val, Step, "induction");
2267   }
2268 
2269   // Floating point induction.
2270   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2271          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers <StartIdx, StartIdx + 1, ...>.
2273   for (int i = 0; i < VLen; ++i)
2274     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2275 
2276   // Add the consecutive indices to the vector value.
2277   Constant *Cv = ConstantVector::get(Indices);
2278 
2279   Step = Builder.CreateVectorSplat(VLen, Step);
2280 
2281   // Floating point operations had to be 'fast' to enable the induction.
2282   FastMathFlags Flags;
2283   Flags.setFast();
2284 
2285   Value *MulOp = Builder.CreateFMul(Cv, Step);
2286   if (isa<Instruction>(MulOp))
    // Have to check: MulOp may have been folded to a constant.
2288     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2289 
2290   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2291   if (isa<Instruction>(BOp))
2292     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2293   return BOp;
2294 }
2295 
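// For each unroll part P and lane L, the scalar step built below is
//   ScalarIV + (P * VF + L) * Step
// e.g. (illustrative) with VF = 4, UF = 2 and an integer step of 2, the lanes
// of part 1 receive %iv + 8, %iv + 10, %iv + 12 and %iv + 14.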
2296 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2297                                            Instruction *EntryVal,
2298                                            const InductionDescriptor &ID,
2299                                            VPValue *Def, VPValue *CastDef,
2300                                            VPTransformState &State) {
2301   // We shouldn't have to build scalar steps if we aren't vectorizing.
2302   assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2304   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2305   assert(ScalarIVTy == Step->getType() &&
2306          "Val and Step should have the same type");
2307 
2308   // We build scalar steps for both integer and floating-point induction
2309   // variables. Here, we determine the kind of arithmetic we will perform.
2310   Instruction::BinaryOps AddOp;
2311   Instruction::BinaryOps MulOp;
2312   if (ScalarIVTy->isIntegerTy()) {
2313     AddOp = Instruction::Add;
2314     MulOp = Instruction::Mul;
2315   } else {
2316     AddOp = ID.getInductionOpcode();
2317     MulOp = Instruction::FMul;
2318   }
2319 
2320   // Determine the number of scalars we need to generate for each unroll
2321   // iteration. If EntryVal is uniform, we only need to generate the first
2322   // lane. Otherwise, we generate all VF values.
2323   unsigned Lanes =
2324       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2325           ? 1
2326           : VF.getKnownMinValue();
2327   assert((!VF.isScalable() || Lanes == 1) &&
2328          "Should never scalarize a scalable vector");
2329   // Compute the scalar steps and save the results in State.
2330   for (unsigned Part = 0; Part < UF; ++Part) {
2331     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2332       auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2333                                          ScalarIVTy->getScalarSizeInBits());
2334       Value *StartIdx =
2335           createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2336       if (ScalarIVTy->isFloatingPointTy())
2337         StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2338       StartIdx = addFastMathFlag(Builder.CreateBinOp(
2339           AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
2340       // The step returned by `createStepForVF` is a runtime-evaluated value
2341       // when VF is scalable. Otherwise, it should be folded into a Constant.
2342       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2343              "Expected StartIdx to be folded to a constant when VF is not "
2344              "scalable");
2345       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2346       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2347       State.set(Def, Add, VPIteration(Part, Lane));
2348       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2349                                             Part, Lane);
2350     }
2351   }
2352 }
2353 
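// For example (illustrative), packing the scalar of lane 2 into an existing
// <4 x i32> vector value emits:
//   %v.packed = insertelement <4 x i32> %v, i32 %scalar, i32 2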
2354 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2355                                                     const VPIteration &Instance,
2356                                                     VPTransformState &State) {
2357   Value *ScalarInst = State.get(Def, Instance);
2358   Value *VectorValue = State.get(Def, Instance.Part);
2359   VectorValue = Builder.CreateInsertElement(
2360       VectorValue, ScalarInst, State.Builder.getInt32(Instance.Lane));
2361   State.set(Def, VectorValue, Instance.Part);
2362 }
2363 
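// Create a shuffle that reverses the lanes of Vec; e.g., for VF = 4 the
// shuffle mask is <3, 2, 1, 0>.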
2364 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2365   assert(Vec->getType()->isVectorTy() && "Invalid type");
2366   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2367   SmallVector<int, 8> ShuffleMask;
2368   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2369     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2370 
2371   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2372 }
2373 
2374 // Return whether we allow using masked interleave-groups (for dealing with
2375 // strided loads/stores that reside in predicated blocks, or for dealing
2376 // with gaps).
2377 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2378   // If an override option has been passed in for interleaved accesses, use it.
2379   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2380     return EnableMaskedInterleavedMemAccesses;
2381 
2382   return TTI.enableMaskedInterleavedAccessVectorization();
2383 }
2384 
2385 // Try to vectorize the interleave group that \p Instr belongs to.
2386 //
// E.g. Translate the following interleaved load group (factor = 3):
2388 //   for (i = 0; i < N; i+=3) {
2389 //     R = Pic[i];             // Member of index 0
2390 //     G = Pic[i+1];           // Member of index 1
2391 //     B = Pic[i+2];           // Member of index 2
2392 //     ... // do something to R, G, B
2393 //   }
2394 // To:
2395 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2396 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2397 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2398 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2399 //
// Or translate the following interleaved store group (factor = 3):
2401 //   for (i = 0; i < N; i+=3) {
2402 //     ... do something to R, G, B
2403 //     Pic[i]   = R;           // Member of index 0
2404 //     Pic[i+1] = G;           // Member of index 1
2405 //     Pic[i+2] = B;           // Member of index 2
2406 //   }
2407 // To:
2408 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2409 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2410 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2411 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2412 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2413 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2414     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2415     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2416     VPValue *BlockInMask) {
2417   Instruction *Instr = Group->getInsertPos();
2418   const DataLayout &DL = Instr->getModule()->getDataLayout();
2419 
  // Prepare the vector type of the interleaved load/store.
2421   Type *ScalarTy = getMemInstValueType(Instr);
2422   unsigned InterleaveFactor = Group->getFactor();
2423   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2424   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2425 
  // Prepare the new pointers.
2427   SmallVector<Value *, 2> AddrParts;
2428   unsigned Index = Group->getIndex(Instr);
2429 
2430   // TODO: extend the masked interleaved-group support to reversed access.
2431   assert((!BlockInMask || !Group->isReverse()) &&
2432          "Reversed masked interleave-group not supported.");
2433 
2434   // If the group is reverse, adjust the index to refer to the last vector lane
2435   // instead of the first. We adjust the index from the first vector lane,
2436   // rather than directly getting the pointer for lane VF - 1, because the
2437   // pointer operand of the interleaved access is supposed to be uniform. For
2438   // uniform instructions, we're only required to generate a value for the
2439   // first vector lane in each unroll iteration.
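  // For example, with VF = 4 and an interleave factor of 3, the index is
  // advanced by (4 - 1) * 3 = 9 positions.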
2440   assert(!VF.isScalable() &&
2441          "scalable vector reverse operation is not implemented");
2442   if (Group->isReverse())
2443     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2444 
2445   for (unsigned Part = 0; Part < UF; Part++) {
2446     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2447     setDebugLocFromInst(Builder, AddrPart);
2448 
    // Note that the current instruction could be at any member index. We need
    // to adjust the address to point at the member of index 0.
2451     //
2452     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2453     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2455     //
2456     // E.g.  A[i+1] = a;     // Member of index 1
2457     //       A[i]   = b;     // Member of index 0
2458     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2460 
2461     bool InBounds = false;
2462     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2463       InBounds = gep->isInBounds();
2464     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2465     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2466 
2467     // Cast to the vector pointer type.
2468     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2469     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2470     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2471   }
2472 
2473   setDebugLocFromInst(Builder, Instr);
2474   Value *PoisonVec = PoisonValue::get(VecTy);
2475 
2476   Value *MaskForGaps = nullptr;
2477   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2478     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2479     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2480     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2481   }
2482 
2483   // Vectorize the interleaved load group.
2484   if (isa<LoadInst>(Instr)) {
2485     // For each unroll part, create a wide load for the group.
2486     SmallVector<Value *, 2> NewLoads;
2487     for (unsigned Part = 0; Part < UF; Part++) {
2488       Instruction *NewLoad;
2489       if (BlockInMask || MaskForGaps) {
2490         assert(useMaskedInterleavedAccesses(*TTI) &&
2491                "masked interleaved groups are not allowed.");
2492         Value *GroupMask = MaskForGaps;
2493         if (BlockInMask) {
2494           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2495           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2496           Value *ShuffledMask = Builder.CreateShuffleVector(
2497               BlockInMaskPart,
2498               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2499               "interleaved.mask");
2500           GroupMask = MaskForGaps
2501                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2502                                                 MaskForGaps)
2503                           : ShuffledMask;
2504         }
2505         NewLoad =
2506             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2507                                      GroupMask, PoisonVec, "wide.masked.vec");
      } else
2510         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2511                                             Group->getAlign(), "wide.vec");
2512       Group->addMetadata(NewLoad);
2513       NewLoads.push_back(NewLoad);
2514     }
2515 
2516     // For each member in the group, shuffle out the appropriate data from the
2517     // wide loads.
2518     unsigned J = 0;
2519     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2520       Instruction *Member = Group->getMember(I);
2521 
2522       // Skip the gaps in the group.
2523       if (!Member)
2524         continue;
2525 
2526       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2527       auto StrideMask =
2528           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2529       for (unsigned Part = 0; Part < UF; Part++) {
2530         Value *StridedVec = Builder.CreateShuffleVector(
2531             NewLoads[Part], StrideMask, "strided.vec");
2532 
        // If this member has a different type, cast the result to that type.
2534         if (Member->getType() != ScalarTy) {
2535           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2536           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2537           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2538         }
2539 
2540         if (Group->isReverse())
2541           StridedVec = reverseVector(StridedVec);
2542 
2543         State.set(VPDefs[J], StridedVec, Part);
2544       }
2545       ++J;
2546     }
2547     return;
2548   }
2549 
  // The subvector type for the current instruction.
2551   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2552   auto *SubVT = VectorType::get(ScalarTy, VF);
2553 
2554   // Vectorize the interleaved store group.
2555   for (unsigned Part = 0; Part < UF; Part++) {
2556     // Collect the stored vector from each member.
2557     SmallVector<Value *, 4> StoredVecs;
2558     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
      assert(Group->getMember(i) &&
             "Failed to get a member from an interleaved store group");
2561 
2562       Value *StoredVec = State.get(StoredValues[i], Part);
2563 
2564       if (Group->isReverse())
2565         StoredVec = reverseVector(StoredVec);
2566 
      // If this member has a different type, cast it to the unified type.
2569       if (StoredVec->getType() != SubVT)
2570         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2571 
2572       StoredVecs.push_back(StoredVec);
2573     }
2574 
2575     // Concatenate all vectors into a wide vector.
2576     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2577 
2578     // Interleave the elements in the wide vector.
2579     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2580     Value *IVec = Builder.CreateShuffleVector(
2581         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2582         "interleaved.vec");
2583 
2584     Instruction *NewStoreInstr;
2585     if (BlockInMask) {
2586       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2587       Value *ShuffledMask = Builder.CreateShuffleVector(
2588           BlockInMaskPart,
2589           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2590           "interleaved.mask");
2591       NewStoreInstr = Builder.CreateMaskedStore(
2592           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2595       NewStoreInstr =
2596           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2597 
2598     Group->addMetadata(NewStoreInstr);
2599   }
2600 }
2601 
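// Widen a load or store according to the cost model's widening decision:
// either as a consecutive (possibly reversed) wide access or as a
// gather/scatter, optionally masked by BlockInMask.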
2602 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2603     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2604     VPValue *StoredValue, VPValue *BlockInMask) {
2605   // Attempt to issue a wide load.
2606   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2607   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2608 
2609   assert((LI || SI) && "Invalid Load/Store instruction");
2610   assert((!SI || StoredValue) && "No stored value provided for widened store");
2611   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2612 
2613   LoopVectorizationCostModel::InstWidening Decision =
2614       Cost->getWideningDecision(Instr, VF);
2615   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2616           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2617           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2618          "CM decision is not to widen the memory instruction");
2619 
2620   Type *ScalarDataTy = getMemInstValueType(Instr);
2621 
2622   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2623   const Align Alignment = getLoadStoreAlignment(Instr);
2624 
2625   // Determine if the pointer operand of the access is either consecutive or
2626   // reverse consecutive.
2627   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2628   bool ConsecutiveStride =
2629       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2630   bool CreateGatherScatter =
2631       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2632 
2633   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2634   // gather/scatter. Otherwise Decision should have been to Scalarize.
2635   assert((ConsecutiveStride || CreateGatherScatter) &&
2636          "The instruction should be scalarized");
2637   (void)ConsecutiveStride;
2638 
2639   VectorParts BlockInMaskParts(UF);
2640   bool isMaskRequired = BlockInMask;
2641   if (isMaskRequired)
2642     for (unsigned Part = 0; Part < UF; ++Part)
2643       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2644 
2645   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2646     // Calculate the pointer for the specific unroll-part.
2647     GetElementPtrInst *PartPtr = nullptr;
2648 
2649     bool InBounds = false;
2650     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2651       InBounds = gep->isInBounds();
2652 
2653     if (Reverse) {
2654       assert(!VF.isScalable() &&
2655              "Reversing vectors is not yet supported for scalable vectors.");
2656 
2657       // If the address is consecutive but reversed, then the
2658       // wide store needs to start at the last vector element.
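      // For example, with VF = 4 and Part = 1 the pointer is advanced by
      // -4 + (1 - 4) = -7 elements, so the wide access covers the four
      // elements Ptr[-7 .. -4].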
2659       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2660           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2661       PartPtr->setIsInBounds(InBounds);
2662       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2663           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2664       PartPtr->setIsInBounds(InBounds);
2665       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2666         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2667     } else {
2668       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2669       PartPtr = cast<GetElementPtrInst>(
2670           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2671       PartPtr->setIsInBounds(InBounds);
2672     }
2673 
2674     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2675     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2676   };
2677 
2678   // Handle Stores:
2679   if (SI) {
2680     setDebugLocFromInst(Builder, SI);
2681 
2682     for (unsigned Part = 0; Part < UF; ++Part) {
2683       Instruction *NewSI = nullptr;
2684       Value *StoredVal = State.get(StoredValue, Part);
2685       if (CreateGatherScatter) {
2686         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2687         Value *VectorGep = State.get(Addr, Part);
2688         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2689                                             MaskPart);
2690       } else {
2691         if (Reverse) {
2692           // If we store to reverse consecutive memory locations, then we need
2693           // to reverse the order of elements in the stored value.
2694           StoredVal = reverseVector(StoredVal);
2695           // We don't want to update the value in the map as it might be used in
2696           // another expression. So don't call resetVectorValue(StoredVal).
2697         }
2698         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2699         if (isMaskRequired)
2700           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2701                                             BlockInMaskParts[Part]);
2702         else
2703           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2704       }
2705       addMetadata(NewSI, SI);
2706     }
2707     return;
2708   }
2709 
2710   // Handle loads.
2711   assert(LI && "Must have a load instruction");
2712   setDebugLocFromInst(Builder, LI);
2713   for (unsigned Part = 0; Part < UF; ++Part) {
2714     Value *NewLI;
2715     if (CreateGatherScatter) {
2716       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2717       Value *VectorGep = State.get(Addr, Part);
2718       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2719                                          nullptr, "wide.masked.gather");
2720       addMetadata(NewLI, LI);
2721     } else {
2722       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2723       if (isMaskRequired)
2724         NewLI = Builder.CreateMaskedLoad(
2725             VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
2726             "wide.masked.load");
2727       else
2728         NewLI =
2729             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2730 
      // Add metadata to the load, but set the result value to the reverse
      // shuffle.
2732       addMetadata(NewLI, LI);
2733       if (Reverse)
2734         NewLI = reverseVector(NewLI);
2735     }
2736 
2737     State.set(Def, NewLI, Part);
2738   }
2739 }
2740 
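// Clone the scalar instruction Instr for a single (part, lane) instance,
// replacing its operands with the corresponding scalar values from State and
// recording the clone as the value of Def for that instance.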
2741 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
2742                                                VPUser &User,
2743                                                const VPIteration &Instance,
2744                                                bool IfPredicateInstr,
2745                                                VPTransformState &State) {
2746   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2747 
2748   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2749   // the first lane and part.
2750   if (isa<NoAliasScopeDeclInst>(Instr))
2751     if (!Instance.isFirstIteration())
2752       return;
2753 
2754   setDebugLocFromInst(Builder, Instr);
2755 
  // Does this instruction return a value?
2757   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2758 
2759   Instruction *Cloned = Instr->clone();
2760   if (!IsVoidRetTy)
2761     Cloned->setName(Instr->getName() + ".cloned");
2762 
2763   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2764                                Builder.GetInsertPoint());
2765   // Replace the operands of the cloned instructions with their scalar
2766   // equivalents in the new loop.
2767   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2768     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2769     auto InputInstance = Instance;
2770     if (!Operand || !OrigLoop->contains(Operand) ||
2771         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2772       InputInstance.Lane = 0;
2773     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2774     Cloned->setOperand(op, NewOp);
2775   }
2776   addNewMetadata(Cloned, Instr);
2777 
2778   // Place the cloned scalar in the new loop.
2779   Builder.Insert(Cloned);
2780 
2781   State.set(Def, Cloned, Instance);
2782 
  // If we just cloned a new assumption, add it to the assumption cache.
2784   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2785     if (II->getIntrinsicID() == Intrinsic::assume)
2786       AC->registerAssumption(II);
2787 
2788   // End if-block.
2789   if (IfPredicateInstr)
2790     PredicatedInstructions.push_back(Cloned);
2791 }
2792 
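// Create the canonical induction variable of loop L: an integer phi starting
// at Start that is incremented by Step in the latch and controls the backedge
// via an equality compare against End.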
2793 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2794                                                       Value *End, Value *Step,
2795                                                       Instruction *DL) {
2796   BasicBlock *Header = L->getHeader();
2797   BasicBlock *Latch = L->getLoopLatch();
2798   // As we're just creating this loop, it's possible no latch exists
2799   // yet. If so, use the header as this will be a single block loop.
2800   if (!Latch)
2801     Latch = Header;
2802 
2803   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2804   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2805   setDebugLocFromInst(Builder, OldInst);
2806   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2807 
2808   Builder.SetInsertPoint(Latch->getTerminator());
2809   setDebugLocFromInst(Builder, OldInst);
2810 
2811   // Create i+1 and fill the PHINode.
2812   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2813   Induction->addIncoming(Start, L->getLoopPreheader());
2814   Induction->addIncoming(Next, Latch);
2815   // Create the compare.
2816   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2817   Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
2818 
2819   // Now we have two terminators. Remove the old one from the block.
2820   Latch->getTerminator()->eraseFromParent();
2821 
2822   return Induction;
2823 }
2824 
2825 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2826   if (TripCount)
2827     return TripCount;
2828 
2829   assert(L && "Create Trip Count for null loop.");
2830   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2831   // Find the loop boundaries.
2832   ScalarEvolution *SE = PSE.getSE();
2833   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2834   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2835          "Invalid loop count");
2836 
2837   Type *IdxTy = Legal->getWidestInductionType();
2838   assert(IdxTy && "No type for induction");
2839 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign-extended before the compare.
  // The only way we can get a backedge-taken count in that case is if the
  // induction variable was signed and therefore will not overflow, so the
  // truncation is legal.
2845   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2846       IdxTy->getPrimitiveSizeInBits())
2847     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2848   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2849 
2850   // Get the total trip count from the count by adding 1.
2851   const SCEV *ExitCount = SE->getAddExpr(
2852       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2853 
2854   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2855 
2856   // Expand the trip count and place the new instructions in the preheader.
2857   // Notice that the pre-header does not change, only the loop body.
2858   SCEVExpander Exp(*SE, DL, "induction");
2859 
2860   // Count holds the overall loop count (N).
2861   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2862                                 L->getLoopPreheader()->getTerminator());
2863 
2864   if (TripCount->getType()->isPointerTy())
2865     TripCount =
2866         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2867                                     L->getLoopPreheader()->getTerminator());
2868 
2869   return TripCount;
2870 }
2871 
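// Compute (and cache) the number of iterations the vector loop executes: the
// trip count rounded down to a multiple of VF * UF, after rounding it up when
// folding the tail and after reserving a final chunk of iterations for the
// scalar loop when a scalar epilogue is required.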
2872 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2873   if (VectorTripCount)
2874     return VectorTripCount;
2875 
2876   Value *TC = getOrCreateTripCount(L);
2877   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2878 
2879   Type *Ty = TC->getType();
2880   // This is where we can make the step a runtime constant.
2881   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
2882 
2883   // If the tail is to be folded by masking, round the number of iterations N
2884   // up to a multiple of Step instead of rounding down. This is done by first
2885   // adding Step-1 and then rounding down. Note that it's ok if this addition
2886   // overflows: the vector induction variable will eventually wrap to zero given
2887   // that it starts at zero and its Step is a power of two; the loop will then
2888   // exit, with the last early-exit vector comparison also producing all-true.
2889   if (Cost->foldTailByMasking()) {
2890     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2891            "VF*UF must be a power of 2 when folding tail by masking");
2892     assert(!VF.isScalable() &&
2893            "Tail folding not yet supported for scalable vectors");
2894     TC = Builder.CreateAdd(
2895         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2896   }
2897 
2898   // Now we need to generate the expression for the part of the loop that the
2899   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2900   // iterations are not required for correctness, or N - Step, otherwise. Step
2901   // is equal to the vectorization factor (number of SIMD elements) times the
2902   // unroll factor (number of SIMD instructions).
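  // For example, with a trip count of 10, VF = 4 and UF = 1 (and no tail
  // folding), Step is 4, the remainder is 2 and the vector trip count is 8.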
2903   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2904 
  // There are two cases where we need to ensure that (at least) the last
  // iteration runs in the scalar remainder loop. In these cases, if the step
  // evenly divides the trip count, we set the remainder to be equal to the
  // step. If the step does not evenly divide the trip count, no adjustment is
  // necessary since there will already be scalar iterations. Note that the
  // minimum iterations check ensures that N >= Step. The cases are:
2911   // 1) If there is a non-reversed interleaved group that may speculatively
2912   //    access memory out-of-bounds.
2913   // 2) If any instruction may follow a conditionally taken exit. That is, if
2914   //    the loop contains multiple exiting blocks, or a single exiting block
2915   //    which is not the latch.
2916   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2917     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2918     R = Builder.CreateSelect(IsZero, Step, R);
2919   }
2920 
2921   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2922 
2923   return VectorTripCount;
2924 }
2925 
2926 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2927                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
2929   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2930   unsigned VF = DstFVTy->getNumElements();
2931   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2933   Type *SrcElemTy = SrcVecTy->getElementType();
2934   Type *DstElemTy = DstFVTy->getElementType();
2935   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2936          "Vector elements must have same size");
2937 
2938   // Do a direct cast if element types are castable.
2939   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2940     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2941   }
  // V cannot be directly cast to the desired vector type. This may happen when
  // V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
2946   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2947          "Only one type should be a pointer type");
2948   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2949          "Only one type should be a floating point type");
2950   Type *IntTy =
2951       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2952   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2953   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2954   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2955 }
2956 
2957 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2958                                                          BasicBlock *Bypass) {
2959   Value *Count = getOrCreateTripCount(L);
2960   // Reuse existing vector loop preheader for TC checks.
2961   // Note that new preheader block is generated for vector loop.
2962   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2963   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2964 
2965   // Generate code to check if the loop's trip count is less than VF * UF, or
2966   // equal to it in case a scalar epilogue is required; this implies that the
2967   // vector trip count is zero. This check also covers the case where adding one
2968   // to the backedge-taken count overflowed leading to an incorrect trip count
2969   // of zero. In this case we will also jump to the scalar loop.
2970   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2971                                           : ICmpInst::ICMP_ULT;
2972 
2973   // If tail is to be folded, vector loop takes care of all iterations.
2974   Value *CheckMinIters = Builder.getFalse();
2975   if (!Cost->foldTailByMasking()) {
2976     Value *Step =
2977         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
2978     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2979   }
2980   // Create new preheader for vector loop.
2981   LoopVectorPreHeader =
2982       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2983                  "vector.ph");
2984 
2985   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2986                                DT->getNode(Bypass)->getIDom()) &&
2987          "TC check is expected to dominate Bypass");
2988 
2989   // Update dominator for Bypass & LoopExit.
2990   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2991   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2992 
2993   ReplaceInstWithInst(
2994       TCCheckBlock->getTerminator(),
2995       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2996   LoopBypassBlocks.push_back(TCCheckBlock);
2997 }
2998 
2999 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3000   // Reuse existing vector loop preheader for SCEV checks.
3001   // Note that new preheader block is generated for vector loop.
3002   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
3003 
  // Generate the code to check the SCEV assumptions that we made.
3005   // We want the new basic block to start at the first instruction in a
3006   // sequence of instructions that form a check.
3007   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3008                    "scev.check");
3009   Value *SCEVCheck = Exp.expandCodeForPredicate(
3010       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
3011 
3012   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3013     if (C->isZero())
3014       return;
3015 
3016   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3017            (OptForSizeBasedOnProfile &&
3018             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3019          "Cannot SCEV check stride or overflow when optimizing for size");
3020 
3021   SCEVCheckBlock->setName("vector.scevcheck");
3022   // Create new preheader for vector loop.
3023   LoopVectorPreHeader =
3024       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
3025                  nullptr, "vector.ph");
3026 
3027   // Update dominator only if this is first RT check.
3028   if (LoopBypassBlocks.empty()) {
3029     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3030     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3031   }
3032 
3033   ReplaceInstWithInst(
3034       SCEVCheckBlock->getTerminator(),
3035       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
3036   LoopBypassBlocks.push_back(SCEVCheckBlock);
3037   AddedSafetyChecks = true;
3038 }
3039 
3040 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3041   // VPlan-native path does not do any analysis for runtime checks currently.
3042   if (EnableVPlanNativePath)
3043     return;
3044 
3045   // Reuse existing vector loop preheader for runtime memory checks.
3046   // Note that new preheader block is generated for vector loop.
3047   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3048 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3052   auto *LAI = Legal->getLAI();
3053   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3054   if (!RtPtrChecking.Need)
3055     return;
3056 
3057   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3058     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3059            "Cannot emit memory checks when optimizing for size, unless forced "
3060            "to vectorize.");
3061     ORE->emit([&]() {
3062       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3063                                         L->getStartLoc(), L->getHeader())
3064              << "Code-size may be reduced by not forcing "
3065                 "vectorization, or by source-code modifications "
3066                 "eliminating the need for runtime checks "
3067                 "(e.g., adding 'restrict').";
3068     });
3069   }
3070 
3071   MemCheckBlock->setName("vector.memcheck");
3072   // Create new preheader for vector loop.
3073   LoopVectorPreHeader =
3074       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3075                  "vector.ph");
3076 
  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  auto *CondBranch = cast<BranchInst>(
      Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
  ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;
3088 
3089   Instruction *FirstCheckInst;
3090   Instruction *MemRuntimeCheck;
3091   SCEVExpander Exp(*PSE.getSE(), MemCheckBlock->getModule()->getDataLayout(),
3092                    "induction");
3093   std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks(
3094       MemCheckBlock->getTerminator(), OrigLoop, RtPtrChecking.getChecks(), Exp);
3095   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3096                             "claimed checks are required");
3097   CondBranch->setCondition(MemRuntimeCheck);
3098 
3099   // We currently don't use LoopVersioning for the actual loop cloning but we
3100   // still use it to add the noalias metadata.
3101   LVer = std::make_unique<LoopVersioning>(
3102       *Legal->getLAI(),
3103       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3104       DT, PSE.getSE());
3105   LVer->prepareNoAliasMetadata();
3106 }
3107 
3108 Value *InnerLoopVectorizer::emitTransformedIndex(
3109     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3110     const InductionDescriptor &ID) const {
3111 
3112   SCEVExpander Exp(*SE, DL, "induction");
3113   auto Step = ID.getStep();
3114   auto StartValue = ID.getStartValue();
3115   assert(Index->getType() == Step->getType() &&
3116          "Index type does not match StepValue type");
3117 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
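  // In all cases below, the transformed index is essentially
  // StartValue op (Index * Step), where 'op' is an integer add, a GEP, or the
  // FP induction binary operator.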
3124   auto CreateAdd = [&B](Value *X, Value *Y) {
3125     assert(X->getType() == Y->getType() && "Types don't match!");
3126     if (auto *CX = dyn_cast<ConstantInt>(X))
3127       if (CX->isZero())
3128         return Y;
3129     if (auto *CY = dyn_cast<ConstantInt>(Y))
3130       if (CY->isZero())
3131         return X;
3132     return B.CreateAdd(X, Y);
3133   };
3134 
3135   auto CreateMul = [&B](Value *X, Value *Y) {
3136     assert(X->getType() == Y->getType() && "Types don't match!");
3137     if (auto *CX = dyn_cast<ConstantInt>(X))
3138       if (CX->isOne())
3139         return Y;
3140     if (auto *CY = dyn_cast<ConstantInt>(Y))
3141       if (CY->isOne())
3142         return X;
3143     return B.CreateMul(X, Y);
3144   };
3145 
3146   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3147   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3148   // the DomTree is not kept up-to-date for additional blocks generated in the
3149   // vector loop. By using the header as insertion point, we guarantee that the
3150   // expanded instructions dominate all their uses.
3151   auto GetInsertPoint = [this, &B]() {
3152     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3153     if (InsertBB != LoopVectorBody &&
3154         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3155       return LoopVectorBody->getTerminator();
3156     return &*B.GetInsertPoint();
3157   };
3158   switch (ID.getKind()) {
3159   case InductionDescriptor::IK_IntInduction: {
3160     assert(Index->getType() == StartValue->getType() &&
3161            "Index type does not match StartValue type");
3162     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3163       return B.CreateSub(StartValue, Index);
3164     auto *Offset = CreateMul(
3165         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3166     return CreateAdd(StartValue, Offset);
3167   }
3168   case InductionDescriptor::IK_PtrInduction: {
3169     assert(isa<SCEVConstant>(Step) &&
3170            "Expected constant step for pointer induction");
3171     return B.CreateGEP(
3172         StartValue->getType()->getPointerElementType(), StartValue,
3173         CreateMul(Index,
3174                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3175   }
3176   case InductionDescriptor::IK_FpInduction: {
3177     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3178     auto InductionBinOp = ID.getInductionBinOp();
3179     assert(InductionBinOp &&
3180            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3181             InductionBinOp->getOpcode() == Instruction::FSub) &&
3182            "Original bin op should be defined for FP induction");
3183 
3184     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3185 
3186     // Floating point operations had to be 'fast' to enable the induction.
3187     FastMathFlags Flags;
3188     Flags.setFast();
3189 
3190     Value *MulExp = B.CreateFMul(StepValue, Index);
3191     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
3193       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3194 
3195     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3196                                "induction");
3197     if (isa<Instruction>(BOp))
3198       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3199 
3200     return BOp;
3201   }
3202   case InductionDescriptor::IK_NoInduction:
3203     return nullptr;
3204   }
3205   llvm_unreachable("invalid enum");
3206 }
3207 
3208 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3209   LoopScalarBody = OrigLoop->getHeader();
3210   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3211   LoopExitBlock = OrigLoop->getUniqueExitBlock();
3212   assert(LoopExitBlock && "Must have an exit block");
3213   assert(LoopVectorPreHeader && "Invalid loop structure");
3214 
3215   LoopMiddleBlock =
3216       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3217                  LI, nullptr, Twine(Prefix) + "middle.block");
3218   LoopScalarPreHeader =
3219       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3220                  nullptr, Twine(Prefix) + "scalar.ph");
3221 
3222   // Set up branch from middle block to the exit and scalar preheader blocks.
3223   // completeLoopSkeleton will update the condition to use an iteration check,
3224   // if required to decide whether to execute the remainder.
3225   BranchInst *BrInst =
3226       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3227   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3228   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3229   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3230 
  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3234   LoopVectorBody =
3235       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3236                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3237 
3238   // Update dominator for loop exit.
3239   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3240 
3241   // Create and register the new vector loop.
3242   Loop *Lp = LI->AllocateLoop();
3243   Loop *ParentLoop = OrigLoop->getParentLoop();
3244 
3245   // Insert the new loop into the loop nest and register the new basic blocks
3246   // before calling any utilities such as SCEV that require valid LoopInfo.
3247   if (ParentLoop) {
3248     ParentLoop->addChildLoop(Lp);
3249   } else {
3250     LI->addTopLevelLoop(Lp);
3251   }
3252   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3253   return Lp;
3254 }
3255 
3256 void InnerLoopVectorizer::createInductionResumeValues(
3257     Loop *L, Value *VectorTripCount,
3258     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3259   assert(VectorTripCount && L && "Expected valid arguments");
3260   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3261           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3262          "Inconsistent information about additional bypass.");
3263   // We are going to resume the execution of the scalar loop.
3264   // Go over all of the induction variables that we found and fix the
3265   // PHIs that are left in the scalar version of the loop.
3266   // The starting values of PHI nodes depend on the counter of the last
3267   // iteration in the vectorized loop.
3268   // If we come from a bypass edge then we need to start from the original
3269   // start value.
3270   for (auto &InductionEntry : Legal->getInductionVars()) {
3271     PHINode *OrigPhi = InductionEntry.first;
3272     InductionDescriptor II = InductionEntry.second;
3273 
    // Create phi nodes to merge from the backedge-taken check block.
3275     PHINode *BCResumeVal =
3276         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3277                         LoopScalarPreHeader->getTerminator());
3278     // Copy original phi DL over to the new one.
3279     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3280     Value *&EndValue = IVEndValues[OrigPhi];
3281     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3282     if (OrigPhi == OldInduction) {
3283       // We know what the end value is.
3284       EndValue = VectorTripCount;
3285     } else {
3286       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3287       Type *StepType = II.getStep()->getType();
3288       Instruction::CastOps CastOp =
3289           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3290       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3291       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3292       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3293       EndValue->setName("ind.end");
3294 
3295       // Compute the end value for the additional bypass (if applicable).
3296       if (AdditionalBypass.first) {
3297         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3298         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3299                                          StepType, true);
3300         CRD =
3301             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3302         EndValueFromAdditionalBypass =
3303             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3304         EndValueFromAdditionalBypass->setName("ind.end");
3305       }
3306     }
3307     // The new PHI merges the original incoming value, in case of a bypass,
3308     // or the value at the end of the vectorized loop.
3309     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3310 
3311     // Fix the scalar body counter (PHI node).
3312     // The old induction's phi node in the scalar body needs the truncated
3313     // value.
3314     for (BasicBlock *BB : LoopBypassBlocks)
3315       BCResumeVal->addIncoming(II.getStartValue(), BB);
3316 
3317     if (AdditionalBypass.first)
3318       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3319                                             EndValueFromAdditionalBypass);
3320 
3321     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3322   }
3323 }
3324 
3325 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3326                                                       MDNode *OrigLoopID) {
3327   assert(L && "Expected valid loop.");
3328 
3329   // The trip counts should be cached by now.
3330   Value *Count = getOrCreateTripCount(L);
3331   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3332 
3333   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3334 
3335   // Add a check in the middle block to see if we have completed
3336   // all of the iterations in the first vector loop.
3337   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3338   // If tail is to be folded, we know we don't need to run the remainder.
3339   if (!Cost->foldTailByMasking()) {
3340     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3341                                         Count, VectorTripCount, "cmp.n",
3342                                         LoopMiddleBlock->getTerminator());
3343 
3344     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3345     // of the corresponding compare because they may have ended up with
3346     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3348     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3349     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3350   }
3351 
3352   // Get ready to start creating new instructions into the vectorized body.
3353   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3354          "Inconsistent vector loop preheader");
3355   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3356 
3357   Optional<MDNode *> VectorizedLoopID =
3358       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3359                                       LLVMLoopVectorizeFollowupVectorized});
3360   if (VectorizedLoopID.hasValue()) {
3361     L->setLoopID(VectorizedLoopID.getValue());
3362 
3363     // Do not setAlreadyVectorized if loop attributes have been defined
3364     // explicitly.
3365     return LoopVectorPreHeader;
3366   }
3367 
3368   // Keep all loop hints from the original loop on the vector loop (we'll
3369   // replace the vectorizer-specific hints below).
3370   if (MDNode *LID = OrigLoop->getLoopID())
3371     L->setLoopID(LID);
3372 
3373   LoopVectorizeHints Hints(L, true, *ORE);
3374   Hints.setAlreadyVectorized();
3375 
3376 #ifdef EXPENSIVE_CHECKS
3377   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3378   LI->verify(*DT);
3379 #endif
3380 
3381   return LoopVectorPreHeader;
3382 }
3383 
3384 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3385   /*
3386    In this function we generate a new loop. The new loop will contain
3387    the vectorized instructions while the old loop will continue to run the
3388    scalar remainder.
3389 
3390        [ ] <-- loop iteration number check.
3391     /   |
3392    /    v
3393   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3394   |  /  |
3395   | /   v
3396   ||   [ ]     <-- vector pre header.
3397   |/    |
3398   |     v
3399   |    [  ] \
3400   |    [  ]_|   <-- vector loop.
3401   |     |
3402   |     v
3403   |   -[ ]   <--- middle-block.
3404   |  /  |
3405   | /   v
3406   -|- >[ ]     <--- new preheader.
3407    |    |
3408    |    v
3409    |   [ ] \
3410    |   [ ]_|   <-- old scalar loop to handle remainder.
3411     \   |
3412      \  v
3413       >[ ]     <-- exit block.
3414    ...
3415    */
3416 
3417   // Get the metadata of the original loop before it gets modified.
3418   MDNode *OrigLoopID = OrigLoop->getLoopID();
3419 
3420   // Create an empty vector loop, and prepare basic blocks for the runtime
3421   // checks.
3422   Loop *Lp = createVectorLoopSkeleton("");
3423 
3424   // Now, compare the new count to zero. If it is zero skip the vector loop and
3425   // jump to the scalar loop. This check also covers the case where the
3426   // backedge-taken count is uint##_max: adding one to it will overflow leading
3427   // to an incorrect trip count of zero. In this (rare) case we will also jump
3428   // to the scalar loop.
3429   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3430 
3431   // Generate the code to check any assumptions that we've made for SCEV
3432   // expressions.
3433   emitSCEVChecks(Lp, LoopScalarPreHeader);
3434 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3438   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3439 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterator loops, which often have multiple
  // pointer induction variables. In the code below we also support the case
  // where we don't have a single induction variable.
3444   //
  // We try hard to obtain an induction variable from the original loop.
  // However, if we don't find one that:
3447   //   - is an integer
3448   //   - counts from zero, stepping by one
3449   //   - is the size of the widest induction variable type
3450   // then we create a new one.
3451   OldInduction = Legal->getPrimaryInduction();
3452   Type *IdxTy = Legal->getWidestInductionType();
3453   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3454   // The loop step is equal to the vectorization factor (num of SIMD elements)
3455   // times the unroll factor (num of SIMD instructions).
3456   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3457   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3458   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3459   Induction =
3460       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3461                               getDebugLocFromInstOrOperands(OldInduction));
3462 
3463   // Emit phis for the new starting index of the scalar loop.
3464   createInductionResumeValues(Lp, CountRoundDown);
3465 
3466   return completeLoopSkeleton(Lp, OrigLoopID);
3467 }
3468 
3469 // Fix up external users of the induction variable. At this point, we are
3470 // in LCSSA form, with all external PHIs that use the IV having one input value,
3471 // coming from the remainder loop. We need those PHIs to also have a correct
3472 // value for the IV when arriving directly from the middle block.
3473 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3474                                        const InductionDescriptor &II,
3475                                        Value *CountRoundDown, Value *EndValue,
3476                                        BasicBlock *MiddleBlock) {
3477   // There are two kinds of external IV usages - those that use the value
3478   // computed in the last iteration (the PHI) and those that use the penultimate
3479   // value (the value that feeds into the phi from the loop latch).
3480   // We allow both, but they, obviously, have different values.
3481 
3482   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3483 
3484   DenseMap<Value *, Value *> MissingVals;
3485 
3486   // An external user of the last iteration's value should see the value that
3487   // the remainder loop uses to initialize its own IV.
3488   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3489   for (User *U : PostInc->users()) {
3490     Instruction *UI = cast<Instruction>(U);
3491     if (!OrigLoop->contains(UI)) {
3492       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3493       MissingVals[UI] = EndValue;
3494     }
3495   }
3496 
  // An external user of the penultimate value needs to see EndValue - Step.
3498   // The simplest way to get this is to recompute it from the constituent SCEVs,
3499   // that is Start + (Step * (CRD - 1)).
3500   for (User *U : OrigPhi->users()) {
3501     auto *UI = cast<Instruction>(U);
3502     if (!OrigLoop->contains(UI)) {
3503       const DataLayout &DL =
3504           OrigLoop->getHeader()->getModule()->getDataLayout();
3505       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3506 
3507       IRBuilder<> B(MiddleBlock->getTerminator());
3508       Value *CountMinusOne = B.CreateSub(
3509           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3510       Value *CMO =
3511           !II.getStep()->getType()->isIntegerTy()
3512               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3513                              II.getStep()->getType())
3514               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3515       CMO->setName("cast.cmo");
3516       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3517       Escape->setName("ind.escape");
3518       MissingVals[UI] = Escape;
3519     }
3520   }
3521 
3522   for (auto &I : MissingVals) {
3523     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3525     // that is %IV2 = phi [...], [ %IV1, %latch ]
3526     // In this case, if IV1 has an external use, we need to avoid adding both
3527     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3528     // don't already have an incoming value for the middle block.
3529     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3530       PHI->addIncoming(I.second, MiddleBlock);
3531   }
3532 }
3533 
3534 namespace {
3535 
3536 struct CSEDenseMapInfo {
3537   static bool canHandle(const Instruction *I) {
3538     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3539            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3540   }
3541 
3542   static inline Instruction *getEmptyKey() {
3543     return DenseMapInfo<Instruction *>::getEmptyKey();
3544   }
3545 
3546   static inline Instruction *getTombstoneKey() {
3547     return DenseMapInfo<Instruction *>::getTombstoneKey();
3548   }
3549 
3550   static unsigned getHashValue(const Instruction *I) {
3551     assert(canHandle(I) && "Unknown instruction!");
3552     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3553                                                            I->value_op_end()));
3554   }
3555 
3556   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3557     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3558         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3559       return LHS == RHS;
3560     return LHS->isIdenticalTo(RHS);
3561   }
3562 };
3563 
3564 } // end anonymous namespace
3565 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3569   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3570   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3571     Instruction *In = &*I++;
3572 
3573     if (!CSEDenseMapInfo::canHandle(In))
3574       continue;
3575 
3576     // Check if we can replace this instruction with any of the
3577     // visited instructions.
3578     if (Instruction *V = CSEMap.lookup(In)) {
3579       In->replaceAllUsesWith(V);
3580       In->eraseFromParent();
3581       continue;
3582     }
3583 
3584     CSEMap[In] = In;
3585   }
3586 }
3587 
3588 InstructionCost
3589 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3590                                               bool &NeedToScalarize) {
3591   Function *F = CI->getCalledFunction();
3592   Type *ScalarRetTy = CI->getType();
3593   SmallVector<Type *, 4> Tys, ScalarTys;
3594   for (auto &ArgOp : CI->arg_operands())
3595     ScalarTys.push_back(ArgOp->getType());
3596 
3597   // Estimate cost of scalarized vector call. The source operands are assumed
3598   // to be vectors, so we need to extract individual elements from there,
3599   // execute VF scalar calls, and then gather the result into the vector return
3600   // value.
3601   InstructionCost ScalarCallCost =
3602       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3603   if (VF.isScalar())
3604     return ScalarCallCost;
3605 
3606   // Compute corresponding vector type for return value and arguments.
3607   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3608   for (Type *ScalarTy : ScalarTys)
3609     Tys.push_back(ToVectorTy(ScalarTy, VF));
3610 
3611   // Compute costs of unpacking argument values for the scalar calls and
3612   // packing the return values to a vector.
3613   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3614 
3615   InstructionCost Cost =
3616       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
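  // For example (hypothetical numbers), with VF = 4, a scalar call cost of 10
  // and a scalarization overhead of 8, the scalarized estimate is
  // 4 * 10 + 8 = 48; it is only replaced below if a cheaper vector variant of
  // the callee is available.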
3617 
3618   // If we can't emit a vector call for this function, then the currently found
3619   // cost is the cost we need to return.
3620   NeedToScalarize = true;
3621   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3622   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3623 
3624   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3625     return Cost;
3626 
3627   // If the corresponding vector cost is cheaper, return its cost.
3628   InstructionCost VectorCallCost =
3629       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3630   if (VectorCallCost < Cost) {
3631     NeedToScalarize = false;
3632     Cost = VectorCallCost;
3633   }
3634   return Cost;
3635 }
3636 
3637 InstructionCost
3638 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3639                                                    ElementCount VF) {
3640   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3641   assert(ID && "Expected intrinsic call!");
3642 
3643   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3644   return TTI.getIntrinsicInstrCost(CostAttrs,
3645                                    TargetTransformInfo::TCK_RecipThroughput);
3646 }
3647 
3648 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3649   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3650   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3651   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3652 }
3653 
3654 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3655   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3656   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3657   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3658 }
3659 
3660 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3661   // For every instruction `I` in MinBWs, truncate the operands, create a
3662   // truncated version of `I` and reextend its result. InstCombine runs
3663   // later and will remove any ext/trunc pairs.
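  // For example (shorthand), if MinBWs records that an i32 add only needs 8
  // bits, the widened
  //   %a = add <4 x i32> %x, %y
  // is rewritten as
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a    = zext <4 x i8> %a.tr to <4 x i32>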
3664   SmallPtrSet<Value *, 4> Erased;
3665   for (const auto &KV : Cost->getMinimalBitwidths()) {
3666     // If the value wasn't vectorized, we must maintain the original scalar
3667     // type. The absence of the value from State indicates that it
3668     // wasn't vectorized.
3669     VPValue *Def = State.Plan->getVPValue(KV.first);
3670     if (!State.hasAnyVectorValue(Def))
3671       continue;
3672     for (unsigned Part = 0; Part < UF; ++Part) {
3673       Value *I = State.get(Def, Part);
3674       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3675         continue;
3676       Type *OriginalTy = I->getType();
3677       Type *ScalarTruncatedTy =
3678           IntegerType::get(OriginalTy->getContext(), KV.second);
3679       auto *TruncatedTy = FixedVectorType::get(
3680           ScalarTruncatedTy,
3681           cast<FixedVectorType>(OriginalTy)->getNumElements());
3682       if (TruncatedTy == OriginalTy)
3683         continue;
3684 
3685       IRBuilder<> B(cast<Instruction>(I));
3686       auto ShrinkOperand = [&](Value *V) -> Value * {
3687         if (auto *ZI = dyn_cast<ZExtInst>(V))
3688           if (ZI->getSrcTy() == TruncatedTy)
3689             return ZI->getOperand(0);
3690         return B.CreateZExtOrTrunc(V, TruncatedTy);
3691       };
3692 
3693       // The actual instruction modification depends on the instruction type,
3694       // unfortunately.
3695       Value *NewI = nullptr;
3696       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3697         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3698                              ShrinkOperand(BO->getOperand(1)));
3699 
3700         // Any wrapping introduced by shrinking this operation shouldn't be
3701         // considered undefined behavior. So, we can't unconditionally copy
3702         // arithmetic wrapping flags to NewI.
3703         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3704       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3705         NewI =
3706             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3707                          ShrinkOperand(CI->getOperand(1)));
3708       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3709         NewI = B.CreateSelect(SI->getCondition(),
3710                               ShrinkOperand(SI->getTrueValue()),
3711                               ShrinkOperand(SI->getFalseValue()));
3712       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3713         switch (CI->getOpcode()) {
3714         default:
3715           llvm_unreachable("Unhandled cast!");
3716         case Instruction::Trunc:
3717           NewI = ShrinkOperand(CI->getOperand(0));
3718           break;
3719         case Instruction::SExt:
3720           NewI = B.CreateSExtOrTrunc(
3721               CI->getOperand(0),
3722               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3723           break;
3724         case Instruction::ZExt:
3725           NewI = B.CreateZExtOrTrunc(
3726               CI->getOperand(0),
3727               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3728           break;
3729         }
3730       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3731         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3732                              ->getNumElements();
3733         auto *O0 = B.CreateZExtOrTrunc(
3734             SI->getOperand(0),
3735             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3736         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3737                              ->getNumElements();
3738         auto *O1 = B.CreateZExtOrTrunc(
3739             SI->getOperand(1),
3740             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3741 
3742         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3743       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3744         // Don't do anything with the operands, just extend the result.
3745         continue;
3746       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3747         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3748                             ->getNumElements();
3749         auto *O0 = B.CreateZExtOrTrunc(
3750             IE->getOperand(0),
3751             FixedVectorType::get(ScalarTruncatedTy, Elements));
3752         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3753         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3754       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3755         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3756                             ->getNumElements();
3757         auto *O0 = B.CreateZExtOrTrunc(
3758             EE->getOperand(0),
3759             FixedVectorType::get(ScalarTruncatedTy, Elements));
3760         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3761       } else {
3762         // If we don't know what to do, be conservative and don't do anything.
3763         continue;
3764       }
3765 
3766       // Lastly, extend the result.
3767       NewI->takeName(cast<Instruction>(I));
3768       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3769       I->replaceAllUsesWith(Res);
3770       cast<Instruction>(I)->eraseFromParent();
3771       Erased.insert(I);
3772       State.reset(Def, Res, Part);
3773     }
3774   }
3775 
3776   // We'll have created a bunch of ZExts that are now dead. Clean them up.
3777   for (const auto &KV : Cost->getMinimalBitwidths()) {
3778     // If the value wasn't vectorized, we must maintain the original scalar
3779     // type. The absence of the value from State indicates that it
3780     // wasn't vectorized.
3781     VPValue *Def = State.Plan->getVPValue(KV.first);
3782     if (!State.hasAnyVectorValue(Def))
3783       continue;
3784     for (unsigned Part = 0; Part < UF; ++Part) {
3785       Value *I = State.get(Def, Part);
3786       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3787       if (Inst && Inst->use_empty()) {
3788         Value *NewI = Inst->getOperand(0);
3789         Inst->eraseFromParent();
3790         State.reset(Def, NewI, Part);
3791       }
3792     }
3793   }
3794 }
3795 
3796 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3797   // Insert truncates and extends for any truncated instructions as hints to
3798   // InstCombine.
3799   if (VF.isVector())
3800     truncateToMinimalBitwidths(State);
3801 
3802   // Fix widened non-induction PHIs by setting up the PHI operands.
3803   if (OrigPHIsToFix.size()) {
3804     assert(EnableVPlanNativePath &&
3805            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3806     fixNonInductionPHIs(State);
3807   }
3808 
3809   // At this point every instruction in the original loop is widened to a
3810   // vector form. Now we need to fix the recurrences in the loop. These PHI
3811   // nodes are currently empty because we did not want to introduce cycles.
3812   // This is the second stage of vectorizing recurrences.
3813   fixCrossIterationPHIs(State);
3814 
3815   // Forget the original basic block.
3816   PSE.getSE()->forgetLoop(OrigLoop);
3817 
3818   // Fix-up external users of the induction variables.
3819   for (auto &Entry : Legal->getInductionVars())
3820     fixupIVUsers(Entry.first, Entry.second,
3821                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3822                  IVEndValues[Entry.first], LoopMiddleBlock);
3823 
3824   fixLCSSAPHIs(State);
3825   for (Instruction *PI : PredicatedInstructions)
3826     sinkScalarOperands(&*PI);
3827 
3828   // Remove redundant induction instructions.
3829   cse(LoopVectorBody);
3830 
3831   // Set/update profile weights for the vector and remainder loops as original
3832   // loop iterations are now distributed among them. Note that original loop
3833   // represented by LoopScalarBody becomes remainder loop after vectorization.
3834   //
3835   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3836   // end up with a slightly roughened result, but that should be OK since the
3837   // profile is not inherently precise anyway. Note also that the possible
3838   // bypass of the vector code caused by legality checks is ignored, assigning
3839   // all the weight to the vector loop, optimistically.
3840   //
3841   // For scalable vectorization we can't know at compile time how many
3842   // iterations of the loop are handled in one vector iteration, so instead
3843   // assume a pessimistic vscale of '1'.
3844   setProfileInfoAfterUnrolling(
3845       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3846       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3847 }
3848 
3849 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3850   // In order to support recurrences we need to be able to vectorize Phi nodes.
3851   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3852   // stage #2: We now need to fix the recurrences by adding incoming edges to
3853   // the currently empty PHI nodes. At this point every instruction in the
3854   // original loop is widened to a vector form so we can use them to construct
3855   // the incoming edges.
3856   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3857     // Handle first-order recurrences and reductions that need to be fixed.
3858     if (Legal->isFirstOrderRecurrence(&Phi))
3859       fixFirstOrderRecurrence(&Phi, State);
3860     else if (Legal->isReductionVariable(&Phi))
3861       fixReduction(&Phi, State);
3862   }
3863 }
3864 
3865 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
3866                                                   VPTransformState &State) {
3867   // This is the second phase of vectorizing first-order recurrences. An
3868   // overview of the transformation is described below. Suppose we have the
3869   // following loop.
3870   //
3871   //   for (int i = 0; i < n; ++i)
3872   //     b[i] = a[i] - a[i - 1];
3873   //
3874   // There is a first-order recurrence on "a". For this loop, the shorthand
3875   // scalar IR looks like:
3876   //
3877   //   scalar.ph:
3878   //     s_init = a[-1]
3879   //     br scalar.body
3880   //
3881   //   scalar.body:
3882   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3883   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3884   //     s2 = a[i]
3885   //     b[i] = s2 - s1
3886   //     br cond, scalar.body, ...
3887   //
3888   // In this example, s1 is a recurrence because its value depends on the
3889   // previous iteration. In the first phase of vectorization, we created a
3890   // temporary value for s1. We now complete the vectorization and produce the
3891   // shorthand vector IR shown below (for VF = 4, UF = 1).
3892   //
3893   //   vector.ph:
3894   //     v_init = vector(..., ..., ..., a[-1])
3895   //     br vector.body
3896   //
3897   //   vector.body
3898   //     i = phi [0, vector.ph], [i+4, vector.body]
3899   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3900   //     v2 = a[i, i+1, i+2, i+3];
3901   //     v3 = vector(v1(3), v2(0, 1, 2))
3902   //     b[i, i+1, i+2, i+3] = v2 - v3
3903   //     br cond, vector.body, middle.block
3904   //
3905   //   middle.block:
3906   //     x = v2(3)
3907   //     br scalar.ph
3908   //
3909   //   scalar.ph:
3910   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3911   //     br scalar.body
3912   //
3913   // After execution completes the vector loop, we extract the next value of
3914   // the recurrence (x) to use as the initial value in the scalar loop.
3915 
3916   // Get the original loop preheader and single loop latch.
3917   auto *Preheader = OrigLoop->getLoopPreheader();
3918   auto *Latch = OrigLoop->getLoopLatch();
3919 
3920   // Get the initial and previous values of the scalar recurrence.
3921   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3922   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3923 
3924   // Create a vector from the initial value.
3925   auto *VectorInit = ScalarInit;
3926   if (VF.isVector()) {
3927     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3928     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3929     VectorInit = Builder.CreateInsertElement(
3930         PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3931         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3932   }
3933 
3934   VPValue *PhiDef = State.Plan->getVPValue(Phi);
3935   VPValue *PreviousDef = State.Plan->getVPValue(Previous);
3936   // We constructed a temporary phi node in the first phase of vectorization.
3937   // This phi node will eventually be deleted.
3938   Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0)));
3939 
3940   // Create a phi node for the new recurrence. The current value will either be
3941   // the initial value inserted into a vector or loop-varying vector value.
3942   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3943   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3944 
3945   // Get the vectorized previous value of the last part UF - 1. It appears last
3946   // among all unrolled iterations, due to the order of their construction.
3947   Value *PreviousLastPart = State.get(PreviousDef, UF - 1);
3948 
3949   // Find and set the insertion point after the previous value if it is an
3950   // instruction.
3951   BasicBlock::iterator InsertPt;
3952   // Note that the previous value may have been constant-folded so it is not
3953   // guaranteed to be an instruction in the vector loop.
3954   // FIXME: Loop invariant values do not form recurrences. We should deal with
3955   //        them earlier.
3956   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3957     InsertPt = LoopVectorBody->getFirstInsertionPt();
3958   else {
3959     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3960     if (isa<PHINode>(PreviousLastPart))
3961       // If the previous value is a phi node, we should insert after all the phi
3962       // nodes in the block containing the PHI to avoid breaking basic block
3963       // verification. Note that the basic block may be different to
3964       // LoopVectorBody, in case we predicate the loop.
3965       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3966     else
3967       InsertPt = ++PreviousInst->getIterator();
3968   }
3969   Builder.SetInsertPoint(&*InsertPt);
3970 
3971   // We will construct a vector for the recurrence by combining the values for
3972   // the current and previous iterations. This is the required shuffle mask.
3973   assert(!VF.isScalable());
3974   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
3975   ShuffleMask[0] = VF.getKnownMinValue() - 1;
3976   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
3977     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
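  // For example, with VF = 4 the mask is <3, 4, 5, 6>: the last lane of
  // `Incoming` followed by the first three lanes of the previous value,
  // matching v3 = vector(v1(3), v2(0, 1, 2)) in the sketch above.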
3978 
3979   // The vector from which to take the initial value for the current iteration
3980   // (actual or unrolled). Initially, this is the vector phi node.
3981   Value *Incoming = VecPhi;
3982 
3983   // Shuffle the current and previous vector and update the vector parts.
3984   for (unsigned Part = 0; Part < UF; ++Part) {
3985     Value *PreviousPart = State.get(PreviousDef, Part);
3986     Value *PhiPart = State.get(PhiDef, Part);
3987     auto *Shuffle =
3988         VF.isVector()
3989             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
3990             : Incoming;
3991     PhiPart->replaceAllUsesWith(Shuffle);
3992     cast<Instruction>(PhiPart)->eraseFromParent();
3993     State.reset(PhiDef, Shuffle, Part);
3994     Incoming = PreviousPart;
3995   }
3996 
3997   // Fix the latch value of the new recurrence in the vector loop.
3998   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3999 
4000   // Extract the last vector element in the middle block. This will be the
4001   // initial value for the recurrence when jumping to the scalar loop.
4002   auto *ExtractForScalar = Incoming;
4003   if (VF.isVector()) {
4004     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4005     ExtractForScalar = Builder.CreateExtractElement(
4006         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4007         "vector.recur.extract");
4008   }
4009   // Extract the second last element in the middle block if the
4010   // Phi is used outside the loop. We need to extract the phi itself
4011   // and not the last element (the phi update in the current iteration). This
4012   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4013   // when the scalar loop is not run at all.
4014   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4015   if (VF.isVector())
4016     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4017         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4018         "vector.recur.extract.for.phi");
4019   // When the loop is unrolled without vectorizing, initialize
4020   // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
4021   // value of `Incoming`. This is analogous to the vectorized case above:
4022   // extracting the second last element when VF > 1.
4023   else if (UF > 1)
4024     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4025 
4026   // Fix the initial value of the original recurrence in the scalar loop.
4027   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4028   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4029   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4030     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4031     Start->addIncoming(Incoming, BB);
4032   }
4033 
4034   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4035   Phi->setName("scalar.recur");
4036 
4037   // Finally, fix users of the recurrence outside the loop. The users will need
4038   // either the last value of the scalar recurrence or the last value of the
4039   // vector recurrence we extracted in the middle block. Since the loop is in
4040   // LCSSA form, we just need to find all the phi nodes for the original scalar
4041   // recurrence in the exit block, and then add an edge for the middle block.
4042   // Note that LCSSA does not imply single entry when the original scalar loop
4043   // had multiple exiting edges (as we always run the last iteration in the
4044   // scalar epilogue); in that case, the exiting path through middle will be
4045   // dynamically dead and the value picked for the phi doesn't matter.
4046   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4047     if (any_of(LCSSAPhi.incoming_values(),
4048                [Phi](Value *V) { return V == Phi; }))
4049       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4050 }
4051 
4052 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
4053   // Get its reduction variable descriptor.
4054   assert(Legal->isReductionVariable(Phi) &&
4055          "Unable to find the reduction variable");
4056   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4057 
4058   RecurKind RK = RdxDesc.getRecurrenceKind();
4059   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4060   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4061   setDebugLocFromInst(Builder, ReductionStartValue);
4062   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4063 
4064   VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4065   // This is the vector-clone of the value that leaves the loop.
4066   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4067 
4068   // Wrap flags are in general invalid after vectorization, clear them.
4069   clearReductionWrapFlags(RdxDesc, State);
4070 
4071   // Fix the vector-loop phi.
4072 
4073   // Reductions do not have to start at zero. They can start with
4074   // any loop invariant values.
4075   BasicBlock *Latch = OrigLoop->getLoopLatch();
4076   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4077 
4078   for (unsigned Part = 0; Part < UF; ++Part) {
4079     Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part);
4080     Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part);
4081     cast<PHINode>(VecRdxPhi)
4082       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4083   }
4084 
4085   // Before each round, move the insertion point right between
4086   // the PHIs and the values we are going to write.
4087   // This allows us to write both PHINodes and the extractelement
4088   // instructions.
4089   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4090 
4091   setDebugLocFromInst(Builder, LoopExitInst);
4092 
4093   // If tail is folded by masking, the vector value to leave the loop should be
4094   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4095   // instead of the former. For an inloop reduction the reduction will already
4096   // be predicated, and does not need to be handled here.
4097   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4098     for (unsigned Part = 0; Part < UF; ++Part) {
4099       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4100       Value *Sel = nullptr;
4101       for (User *U : VecLoopExitInst->users()) {
4102         if (isa<SelectInst>(U)) {
4103           assert(!Sel && "Reduction exit feeding two selects");
4104           Sel = U;
4105         } else
4106           assert(isa<PHINode>(U) && "Reduction exit must feed PHIs or a select");
4107       }
4108       assert(Sel && "Reduction exit feeds no select");
4109       State.reset(LoopExitInstDef, Sel, Part);
4110 
4111       // If the target can create a predicated operator for the reduction at no
4112       // extra cost in the loop (for example a predicated vadd), it can be
4113       // cheaper for the select to remain in the loop than be sunk out of it,
4114       // and so use the select value for the phi instead of the old
4115       // LoopExitValue.
4116       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4117       if (PreferPredicatedReductionSelect ||
4118           TTI->preferPredicatedReductionSelect(
4119               RdxDesc.getOpcode(), Phi->getType(),
4120               TargetTransformInfo::ReductionFlags())) {
4121         auto *VecRdxPhi =
4122             cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part));
4123         VecRdxPhi->setIncomingValueForBlock(
4124             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4125       }
4126     }
4127   }
4128 
4129   // If the vector reduction can be performed in a smaller type, we truncate
4130   // then extend the loop exit value to enable InstCombine to evaluate the
4131   // entire expression in the smaller type.
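  // For example (shorthand), an i32 reduction whose recurrence type is i8 has
  // each unrolled part rewritten as
  //   %part.tr  = trunc <4 x i32> %part to <4 x i8>
  //   %part.ext = sext/zext <4 x i8> %part.tr to <4 x i32>
  // with users redirected to %part.ext, while the value handed to the middle
  // block is the re-truncated <4 x i8> form.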
4132   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4133     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4134     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4135     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4136     Builder.SetInsertPoint(
4137         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4138     VectorParts RdxParts(UF);
4139     for (unsigned Part = 0; Part < UF; ++Part) {
4140       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4141       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4142       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4143                                         : Builder.CreateZExt(Trunc, VecTy);
4144       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4145            UI != RdxParts[Part]->user_end();)
4146         if (*UI != Trunc) {
4147           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4148           RdxParts[Part] = Extnd;
4149         } else {
4150           ++UI;
4151         }
4152     }
4153     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4154     for (unsigned Part = 0; Part < UF; ++Part) {
4155       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4156       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4157     }
4158   }
4159 
4160   // Reduce all of the unrolled parts into a single vector.
4161   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4162   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4163 
4164   // The middle block terminator has already been assigned a DebugLoc here (the
4165   // OrigLoop's single latch terminator). We want the whole middle block to
4166   // appear to execute on this line because: (a) it is all compiler generated,
4167   // (b) these instructions are always executed after evaluating the latch
4168   // conditional branch, and (c) other passes may add new predecessors which
4169   // terminate on this line. This is the easiest way to ensure we don't
4170   // accidentally cause an extra step back into the loop while debugging.
4171   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4172   {
4173     // Floating-point operations should have some FMF to enable the reduction.
4174     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4175     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4176     for (unsigned Part = 1; Part < UF; ++Part) {
4177       Value *RdxPart = State.get(LoopExitInstDef, Part);
4178       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4179         ReducedPartRdx = Builder.CreateBinOp(
4180             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4181       } else {
4182         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4183       }
4184     }
4185   }
4186 
4187   // Create the reduction after the loop. Note that inloop reductions create the
4188   // target reduction in the loop using a Reduction recipe.
4189   if (VF.isVector() && !IsInLoopReductionPhi) {
4190     ReducedPartRdx =
4191         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4192     // If the reduction can be performed in a smaller type, we need to extend
4193     // the reduction to the wider type before we branch to the original loop.
4194     if (Phi->getType() != RdxDesc.getRecurrenceType())
4195       ReducedPartRdx =
4196         RdxDesc.isSigned()
4197         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4198         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4199   }
4200 
4201   // Create a phi node that merges control-flow from the backedge-taken check
4202   // block and the middle block.
4203   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4204                                         LoopScalarPreHeader->getTerminator());
4205   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4206     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4207   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4208 
4209   // Now, we need to fix the users of the reduction variable
4210   // inside and outside of the scalar remainder loop.
4211 
4212   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4213   // in the exit blocks.  See comment on analogous loop in
4214   // fixFirstOrderRecurrence for a more complete explanation of the logic.
4215   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4216     if (any_of(LCSSAPhi.incoming_values(),
4217                [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4218       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4219 
4220   // Fix the scalar loop reduction variable with the incoming reduction sum
4221   // from the vector body and from the backedge value.
4222   int IncomingEdgeBlockIdx =
4223     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4224   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4225   // Pick the other block.
4226   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4227   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4228   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4229 }
4230 
4231 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
4232                                                   VPTransformState &State) {
4233   RecurKind RK = RdxDesc.getRecurrenceKind();
4234   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4235     return;
4236 
4237   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4238   assert(LoopExitInstr && "null loop exit instruction");
4239   SmallVector<Instruction *, 8> Worklist;
4240   SmallPtrSet<Instruction *, 8> Visited;
4241   Worklist.push_back(LoopExitInstr);
4242   Visited.insert(LoopExitInstr);
4243 
4244   while (!Worklist.empty()) {
4245     Instruction *Cur = Worklist.pop_back_val();
4246     if (isa<OverflowingBinaryOperator>(Cur))
4247       for (unsigned Part = 0; Part < UF; ++Part) {
4248         Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4249         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4250       }
4251 
4252     for (User *U : Cur->users()) {
4253       Instruction *UI = cast<Instruction>(U);
4254       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4255           Visited.insert(UI).second)
4256         Worklist.push_back(UI);
4257     }
4258   }
4259 }
4260 
4261 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4262   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4263     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4264       // Some phis were already hand updated by the reduction and recurrence
4265       // code above, leave them alone.
4266       continue;
4267 
4268     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4269     // Non-instruction incoming values will have only one value.
4270     unsigned LastLane = 0;
4271     if (isa<Instruction>(IncomingValue))
4272       LastLane = Cost->isUniformAfterVectorization(
4273                      cast<Instruction>(IncomingValue), VF)
4274                      ? 0
4275                      : VF.getKnownMinValue() - 1;
4276     assert((!VF.isScalable() || LastLane == 0) &&
4277            "scalable vectors dont support non-uniform scalars yet");
4278     // Can be a loop invariant incoming value or the last scalar value to be
4279     // extracted from the vectorized loop.
4280     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4281     Value *lastIncomingValue =
4282         OrigLoop->isLoopInvariant(IncomingValue)
4283             ? IncomingValue
4284             : State.get(State.Plan->getVPValue(IncomingValue),
4285                         VPIteration(UF - 1, LastLane));
4286     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4287   }
4288 }
4289 
4290 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4291   // The basic block and loop containing the predicated instruction.
4292   auto *PredBB = PredInst->getParent();
4293   auto *VectorLoop = LI->getLoopFor(PredBB);
4294 
4295   // Initialize a worklist with the operands of the predicated instruction.
4296   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4297 
4298   // Holds instructions that we need to analyze again. An instruction may be
4299   // reanalyzed if we don't yet know if we can sink it or not.
4300   SmallVector<Instruction *, 8> InstsToReanalyze;
4301 
4302   // Returns true if a given use occurs in the predicated block. Phi nodes use
4303   // their operands in their corresponding predecessor blocks.
4304   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4305     auto *I = cast<Instruction>(U.getUser());
4306     BasicBlock *BB = I->getParent();
4307     if (auto *Phi = dyn_cast<PHINode>(I))
4308       BB = Phi->getIncomingBlock(
4309           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4310     return BB == PredBB;
4311   };
4312 
4313   // Iteratively sink the scalarized operands of the predicated instruction
4314   // into the block we created for it. When an instruction is sunk, its
4315   // operands are then added to the worklist. The algorithm ends after one pass
4316   // through the worklist doesn't sink a single instruction.
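  // For example (shorthand), a scalarized address computation that feeds only
  // a predicated store is moved into the store's block; its operands are then
  // reconsidered and may be sunk themselves on a later pass over the worklist.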
4317   bool Changed;
4318   do {
4319     // Add the instructions that need to be reanalyzed to the worklist, and
4320     // reset the changed indicator.
4321     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4322     InstsToReanalyze.clear();
4323     Changed = false;
4324 
4325     while (!Worklist.empty()) {
4326       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4327 
4328       // We can't sink an instruction if it is a phi node, is already in the
4329       // predicated block, is not in the loop, or may have side effects.
4330       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4331           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4332         continue;
4333 
4334       // It's legal to sink the instruction if all its uses occur in the
4335       // predicated block. Otherwise, there's nothing to do yet, and we may
4336       // need to reanalyze the instruction.
4337       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4338         InstsToReanalyze.push_back(I);
4339         continue;
4340       }
4341 
4342       // Move the instruction to the beginning of the predicated block, and add
4343       // its operands to the worklist.
4344       I->moveBefore(&*PredBB->getFirstInsertionPt());
4345       Worklist.insert(I->op_begin(), I->op_end());
4346 
4347       // The sinking may have enabled other instructions to be sunk, so we will
4348       // need to iterate.
4349       Changed = true;
4350     }
4351   } while (Changed);
4352 }
4353 
4354 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4355   for (PHINode *OrigPhi : OrigPHIsToFix) {
4356     VPWidenPHIRecipe *VPPhi =
4357         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4358     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4359     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4360       VPValue *Inc = VPPhi->getIncomingValue(i);
4361       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4362       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4363     }
4364   }
4365 }
4366 
4367 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4368                                    VPUser &Operands, unsigned UF,
4369                                    ElementCount VF, bool IsPtrLoopInvariant,
4370                                    SmallBitVector &IsIndexLoopInvariant,
4371                                    VPTransformState &State) {
4372   // Construct a vector GEP by widening the operands of the scalar GEP as
4373   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4374   // results in a vector of pointers when at least one operand of the GEP
4375   // is vector-typed. Thus, to keep the representation compact, we only use
4376   // vector-typed operands for loop-varying values.
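  // For example (shorthand), with a loop-invariant base pointer and a widened
  // index,
  //   %g = getelementptr i32, i32* %base, i64 %iv
  // becomes one vector-of-pointers GEP per unroll part:
  //   %g = getelementptr i32, i32* %base, <4 x i64> %vec.iv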
4377 
4378   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4379     // If we are vectorizing, but the GEP has only loop-invariant operands,
4380     // the GEP we build (by only using vector-typed operands for
4381     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4382     // produce a vector of pointers, we need to either arbitrarily pick an
4383     // operand to broadcast, or broadcast a clone of the original GEP.
4384     // Here, we broadcast a clone of the original.
4385     //
4386     // TODO: If at some point we decide to scalarize instructions having
4387     //       loop-invariant operands, this special case will no longer be
4388     //       required. We would add the scalarization decision to
4389     //       collectLoopScalars() and teach getVectorValue() to broadcast
4390     //       the lane-zero scalar value.
4391     auto *Clone = Builder.Insert(GEP->clone());
4392     for (unsigned Part = 0; Part < UF; ++Part) {
4393       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4394       State.set(VPDef, EntryPart, Part);
4395       addMetadata(EntryPart, GEP);
4396     }
4397   } else {
4398     // If the GEP has at least one loop-varying operand, we are sure to
4399     // produce a vector of pointers. But if we are only unrolling, we want
4400     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4401     // produce with the code below will be scalar (if VF == 1) or vector
4402     // (otherwise). Note that for the unroll-only case, we still maintain
4403     // values in the vector mapping with initVector, as we do for other
4404     // instructions.
4405     for (unsigned Part = 0; Part < UF; ++Part) {
4406       // The pointer operand of the new GEP. If it's loop-invariant, we
4407       // won't broadcast it.
4408       auto *Ptr = IsPtrLoopInvariant
4409                       ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4410                       : State.get(Operands.getOperand(0), Part);
4411 
4412       // Collect all the indices for the new GEP. If any index is
4413       // loop-invariant, we won't broadcast it.
4414       SmallVector<Value *, 4> Indices;
4415       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4416         VPValue *Operand = Operands.getOperand(I);
4417         if (IsIndexLoopInvariant[I - 1])
4418           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4419         else
4420           Indices.push_back(State.get(Operand, Part));
4421       }
4422 
4423       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4424       // but it should be a vector, otherwise.
4425       auto *NewGEP =
4426           GEP->isInBounds()
4427               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4428                                           Indices)
4429               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4430       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4431              "NewGEP is not a pointer vector");
4432       State.set(VPDef, NewGEP, Part);
4433       addMetadata(NewGEP, GEP);
4434     }
4435   }
4436 }
4437 
4438 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4439                                               RecurrenceDescriptor *RdxDesc,
4440                                               VPValue *StartVPV, VPValue *Def,
4441                                               VPTransformState &State) {
4442   PHINode *P = cast<PHINode>(PN);
4443   if (EnableVPlanNativePath) {
4444     // Currently we enter here in the VPlan-native path for non-induction
4445     // PHIs where all control flow is uniform. We simply widen these PHIs.
4446     // Create a vector phi with no operands - the vector phi operands will be
4447     // set at the end of vector code generation.
4448     Type *VecTy = (State.VF.isScalar())
4449                       ? PN->getType()
4450                       : VectorType::get(PN->getType(), State.VF);
4451     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4452     State.set(Def, VecPhi, 0);
4453     OrigPHIsToFix.push_back(P);
4454 
4455     return;
4456   }
4457 
4458   assert(PN->getParent() == OrigLoop->getHeader() &&
4459          "Non-header phis should have been handled elsewhere");
4460 
4461   Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4462   // In order to support recurrences we need to be able to vectorize Phi nodes.
4463   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4464   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4465   // this value when we vectorize all of the instructions that use the PHI.
4466   if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4467     Value *Iden = nullptr;
4468     bool ScalarPHI =
4469         (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4470     Type *VecTy =
4471         ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4472 
4473     if (RdxDesc) {
4474       assert(Legal->isReductionVariable(P) && StartV &&
4475              "RdxDesc should only be set for reduction variables; in that case "
4476              "a StartV is also required");
4477       RecurKind RK = RdxDesc->getRecurrenceKind();
4478       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4479         // MinMax reductions have the start value as their identity.
4480         if (ScalarPHI) {
4481           Iden = StartV;
4482         } else {
4483           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4484           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4485           StartV = Iden =
4486               Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4487         }
4488       } else {
4489         Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4490             RK, VecTy->getScalarType());
4491         Iden = IdenC;
4492 
4493         if (!ScalarPHI) {
4494           Iden = ConstantVector::getSplat(State.VF, IdenC);
4495           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4496           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4497           Constant *Zero = Builder.getInt32(0);
4498           StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4499         }
4500       }
4501     }
4502 
4503     for (unsigned Part = 0; Part < State.UF; ++Part) {
4504       // This is phase one of vectorizing PHIs.
4505       Value *EntryPart = PHINode::Create(
4506           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4507       State.set(Def, EntryPart, Part);
4508       if (StartV) {
4509         // Make sure to add the reduction start value only to the
4510         // first unroll part.
4511         Value *StartVal = (Part == 0) ? StartV : Iden;
4512         cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4513       }
4514     }
4515     return;
4516   }
4517 
4518   assert(!Legal->isReductionVariable(P) &&
4519          "reductions should be handled above");
4520 
4521   setDebugLocFromInst(Builder, P);
4522 
4523   // This PHINode must be an induction variable.
4524   // Make sure that we know about it.
4525   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4526 
4527   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4528   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4529 
4530   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4531   // which can be found from the original scalar operations.
4532   switch (II.getKind()) {
4533   case InductionDescriptor::IK_NoInduction:
4534     llvm_unreachable("Unknown induction");
4535   case InductionDescriptor::IK_IntInduction:
4536   case InductionDescriptor::IK_FpInduction:
4537     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4538   case InductionDescriptor::IK_PtrInduction: {
4539     // Handle the pointer induction variable case.
4540     assert(P->getType()->isPointerTy() && "Unexpected type.");
4541 
4542     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4543       // This is the normalized GEP that starts counting at zero.
4544       Value *PtrInd =
4545           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4546       // Determine the number of scalars we need to generate for each unroll
4547       // iteration. If the instruction is uniform, we only need to generate the
4548       // first lane. Otherwise, we generate all VF values.
4549       unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF)
4550                            ? 1
4551                            : State.VF.getKnownMinValue();
4552       for (unsigned Part = 0; Part < UF; ++Part) {
4553         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4554           Constant *Idx = ConstantInt::get(
4555               PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue());
4556           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4557           Value *SclrGep =
4558               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4559           SclrGep->setName("next.gep");
4560           State.set(Def, SclrGep, VPIteration(Part, Lane));
4561         }
4562       }
4563       return;
4564     }
4565     assert(isa<SCEVConstant>(II.getStep()) &&
4566            "Induction step not a SCEV constant!");
4567     Type *PhiType = II.getStep()->getType();
4568 
4569     // Build a pointer phi
4570     Value *ScalarStartValue = II.getStartValue();
4571     Type *ScStValueType = ScalarStartValue->getType();
4572     PHINode *NewPointerPhi =
4573         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4574     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4575 
4576     // A pointer induction, performed by using a gep
4577     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4578     Instruction *InductionLoc = LoopLatch->getTerminator();
4579     const SCEV *ScalarStep = II.getStep();
4580     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4581     Value *ScalarStepValue =
4582         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4583     Value *InductionGEP = GetElementPtrInst::Create(
4584         ScStValueType->getPointerElementType(), NewPointerPhi,
4585         Builder.CreateMul(
4586             ScalarStepValue,
4587             ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)),
4588         "ptr.ind", InductionLoc);
4589     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4590 
4591     // Create UF many actual address geps that use the pointer
4592     // phi as base and a vectorized version of the step value
4593     // (<step*0, ..., step*N>) as offset.
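    // For example, with VF = 4, UF = 2 and step S, part 0 uses offsets
    // <0, S, 2*S, 3*S> and part 1 uses <4*S, 5*S, 6*S, 7*S>.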
4594     for (unsigned Part = 0; Part < State.UF; ++Part) {
4595       SmallVector<Constant *, 8> Indices;
4596       // Create a vector of consecutive numbers from zero to VF.
4597       for (unsigned i = 0; i < State.VF.getKnownMinValue(); ++i)
4598         Indices.push_back(
4599             ConstantInt::get(PhiType, i + Part * State.VF.getKnownMinValue()));
4600       Constant *StartOffset = ConstantVector::get(Indices);
4601 
4602       Value *GEP = Builder.CreateGEP(
4603           ScStValueType->getPointerElementType(), NewPointerPhi,
4604           Builder.CreateMul(StartOffset,
4605                             Builder.CreateVectorSplat(
4606                                 State.VF.getKnownMinValue(), ScalarStepValue),
4607                             "vector.gep"));
4608       State.set(Def, GEP, Part);
4609     }
4610   }
4611   }
4612 }
4613 
4614 /// A helper function for checking whether an integer division-related
4615 /// instruction may divide by zero (in which case it must be predicated if
4616 /// executed conditionally in the scalar code).
4617 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4618 /// Non-zero divisors that are not compile-time constants will not be
4619 /// converted into multiplication, so we will still end up scalarizing
4620 /// the division, but can do so w/o predication.
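/// For example, `udiv i32 %x, %n` with an unknown %n must be predicated when
/// executed conditionally in scalar code, whereas `udiv i32 %x, 7` need not be.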
4621 static bool mayDivideByZero(Instruction &I) {
4622   assert((I.getOpcode() == Instruction::UDiv ||
4623           I.getOpcode() == Instruction::SDiv ||
4624           I.getOpcode() == Instruction::URem ||
4625           I.getOpcode() == Instruction::SRem) &&
4626          "Unexpected instruction");
4627   Value *Divisor = I.getOperand(1);
4628   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4629   return !CInt || CInt->isZero();
4630 }
4631 
4632 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4633                                            VPUser &User,
4634                                            VPTransformState &State) {
4635   switch (I.getOpcode()) {
4636   case Instruction::Call:
4637   case Instruction::Br:
4638   case Instruction::PHI:
4639   case Instruction::GetElementPtr:
4640   case Instruction::Select:
4641     llvm_unreachable("This instruction is handled by a different recipe.");
4642   case Instruction::UDiv:
4643   case Instruction::SDiv:
4644   case Instruction::SRem:
4645   case Instruction::URem:
4646   case Instruction::Add:
4647   case Instruction::FAdd:
4648   case Instruction::Sub:
4649   case Instruction::FSub:
4650   case Instruction::FNeg:
4651   case Instruction::Mul:
4652   case Instruction::FMul:
4653   case Instruction::FDiv:
4654   case Instruction::FRem:
4655   case Instruction::Shl:
4656   case Instruction::LShr:
4657   case Instruction::AShr:
4658   case Instruction::And:
4659   case Instruction::Or:
4660   case Instruction::Xor: {
4661     // Just widen unops and binops.
4662     setDebugLocFromInst(Builder, &I);
4663 
4664     for (unsigned Part = 0; Part < UF; ++Part) {
4665       SmallVector<Value *, 2> Ops;
4666       for (VPValue *VPOp : User.operands())
4667         Ops.push_back(State.get(VPOp, Part));
4668 
4669       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4670 
4671       if (auto *VecOp = dyn_cast<Instruction>(V))
4672         VecOp->copyIRFlags(&I);
4673 
4674       // Use this vector value for all users of the original instruction.
4675       State.set(Def, V, Part);
4676       addMetadata(V, &I);
4677     }
4678 
4679     break;
4680   }
4681   case Instruction::ICmp:
4682   case Instruction::FCmp: {
4683     // Widen compares. Generate vector compares.
4684     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4685     auto *Cmp = cast<CmpInst>(&I);
4686     setDebugLocFromInst(Builder, Cmp);
4687     for (unsigned Part = 0; Part < UF; ++Part) {
4688       Value *A = State.get(User.getOperand(0), Part);
4689       Value *B = State.get(User.getOperand(1), Part);
4690       Value *C = nullptr;
4691       if (FCmp) {
4692         // Propagate fast math flags.
4693         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4694         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4695         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4696       } else {
4697         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4698       }
4699       State.set(Def, C, Part);
4700       addMetadata(C, &I);
4701     }
4702 
4703     break;
4704   }
4705 
4706   case Instruction::ZExt:
4707   case Instruction::SExt:
4708   case Instruction::FPToUI:
4709   case Instruction::FPToSI:
4710   case Instruction::FPExt:
4711   case Instruction::PtrToInt:
4712   case Instruction::IntToPtr:
4713   case Instruction::SIToFP:
4714   case Instruction::UIToFP:
4715   case Instruction::Trunc:
4716   case Instruction::FPTrunc:
4717   case Instruction::BitCast: {
4718     auto *CI = cast<CastInst>(&I);
4719     setDebugLocFromInst(Builder, CI);
4720 
4721     /// Vectorize casts.
4722     Type *DestTy =
4723         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4724 
4725     for (unsigned Part = 0; Part < UF; ++Part) {
4726       Value *A = State.get(User.getOperand(0), Part);
4727       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4728       State.set(Def, Cast, Part);
4729       addMetadata(Cast, &I);
4730     }
4731     break;
4732   }
4733   default:
4734     // This instruction is not vectorized by simple widening.
4735     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4736     llvm_unreachable("Unhandled instruction!");
4737   } // end of switch.
4738 }
4739 
4740 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4741                                                VPUser &ArgOperands,
4742                                                VPTransformState &State) {
4743   assert(!isa<DbgInfoIntrinsic>(I) &&
4744          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4745   setDebugLocFromInst(Builder, &I);
4746 
4747   Module *M = I.getParent()->getParent()->getParent();
4748   auto *CI = cast<CallInst>(&I);
4749 
4750   SmallVector<Type *, 4> Tys;
4751   for (Value *ArgOperand : CI->arg_operands())
4752     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4753 
4754   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4755 
4756   // The flag shows whether we use an intrinsic or a regular call for the
4757   // vectorized version of the instruction, i.e., whether it is beneficial to
4758   // perform the intrinsic call rather than the library call.
4759   bool NeedToScalarize = false;
4760   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4761   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4762   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4763   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4764          "Instruction should be scalarized elsewhere.");
4765   assert(IntrinsicCost.isValid() && CallCost.isValid() &&
4766          "Cannot have invalid costs while widening");
4767 
4768   for (unsigned Part = 0; Part < UF; ++Part) {
4769     SmallVector<Value *, 4> Args;
4770     for (auto &I : enumerate(ArgOperands.operands())) {
4771       // Some intrinsics have a scalar argument - don't replace it with a
4772       // vector.
4773       Value *Arg;
4774       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4775         Arg = State.get(I.value(), Part);
4776       else
4777         Arg = State.get(I.value(), VPIteration(0, 0));
4778       Args.push_back(Arg);
4779     }
4780 
4781     Function *VectorF;
4782     if (UseVectorIntrinsic) {
4783       // Use vector version of the intrinsic.
4784       Type *TysForDecl[] = {CI->getType()};
4785       if (VF.isVector())
4786         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
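      // For example, widening a call to llvm.fabs.f32 with VF = 4 requests the
      // declaration of llvm.fabs.v4f32 below.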
4787       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4788       assert(VectorF && "Can't retrieve vector intrinsic.");
4789     } else {
4790       // Use vector version of the function call.
4791       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4792 #ifndef NDEBUG
4793       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4794              "Can't create vector function.");
4795 #endif
4796       VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4797     }
4798     SmallVector<OperandBundleDef, 1> OpBundles;
4799     CI->getOperandBundlesAsDefs(OpBundles);
4800     CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4801 
4802     if (isa<FPMathOperator>(V))
4803       V->copyFastMathFlags(CI);
4804 
4805     State.set(Def, V, Part);
4806     addMetadata(V, &I);
4807   }
4808 }
4809 
4810 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4811                                                  VPUser &Operands,
4812                                                  bool InvariantCond,
4813                                                  VPTransformState &State) {
4814   setDebugLocFromInst(Builder, &I);
4815 
4816   // The condition can be loop invariant but still defined inside the
4817   // loop. This means that we can't just use the original 'cond' value.
4818   // We have to take the 'vectorized' value and pick the first lane.
4819   // Instcombine will make this a no-op.
4820   auto *InvarCond = InvariantCond
4821                         ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4822                         : nullptr;
4823 
4824   for (unsigned Part = 0; Part < UF; ++Part) {
4825     Value *Cond =
4826         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4827     Value *Op0 = State.get(Operands.getOperand(1), Part);
4828     Value *Op1 = State.get(Operands.getOperand(2), Part);
4829     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4830     State.set(VPDef, Sel, Part);
4831     addMetadata(Sel, &I);
4832   }
4833 }
4834 
4835 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4836   // We should not collect Scalars more than once per VF. Right now, this
4837   // function is called from collectUniformsAndScalars(), which already does
4838   // this check. Collecting Scalars for VF=1 does not make any sense.
4839   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4840          "This function should not be visited twice for the same VF");
4841 
4842   SmallSetVector<Instruction *, 8> Worklist;
4843 
4844   // These sets are used to seed the analysis with pointers used by memory
4845   // accesses that will remain scalar.
4846   SmallSetVector<Instruction *, 8> ScalarPtrs;
4847   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4848   auto *Latch = TheLoop->getLoopLatch();
4849 
4850   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4851   // The pointer operands of loads and stores will be scalar as long as the
4852   // memory access is not a gather or scatter operation. The value operand of a
4853   // store will remain scalar if the store is scalarized.
4854   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4855     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4856     assert(WideningDecision != CM_Unknown &&
4857            "Widening decision should be ready at this moment");
4858     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4859       if (Ptr == Store->getValueOperand())
4860         return WideningDecision == CM_Scalarize;
4861     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4862            "Ptr is neither a value nor a pointer operand");
4863     return WideningDecision != CM_GatherScatter;
4864   };
4865 
4866   // A helper that returns true if the given value is a bitcast or
4867   // getelementptr instruction contained in the loop.
4868   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4869     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4870             isa<GetElementPtrInst>(V)) &&
4871            !TheLoop->isLoopInvariant(V);
4872   };
4873 
4874   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4875     if (!isa<PHINode>(Ptr) ||
4876         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4877       return false;
4878     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4879     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4880       return false;
4881     return isScalarUse(MemAccess, Ptr);
4882   };
4883 
4884   // A helper that evaluates a memory access's use of a pointer. If the
4885   // pointer is a pointer induction of the loop, it and its update are
4886   // inserted into Worklist. Otherwise, if the use will be a scalar use and
4887   // the pointer is only used by memory accesses, we place the pointer in
4888   // ScalarPtrs; if not, the pointer is placed in PossibleNonScalarPtrs.
4889   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4890     if (isScalarPtrInduction(MemAccess, Ptr)) {
4891       Worklist.insert(cast<Instruction>(Ptr));
4892       Instruction *Update = cast<Instruction>(
4893           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4894       Worklist.insert(Update);
4895       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4896                         << "\n");
4897       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4898                         << "\n");
4899       return;
4900     }
4901     // We only care about bitcast and getelementptr instructions contained in
4902     // the loop.
4903     if (!isLoopVaryingBitCastOrGEP(Ptr))
4904       return;
4905 
4906     // If the pointer has already been identified as scalar (e.g., if it was
4907     // also identified as uniform), there's nothing to do.
4908     auto *I = cast<Instruction>(Ptr);
4909     if (Worklist.count(I))
4910       return;
4911 
4912     // If the use of the pointer will be a scalar use, and all users of the
4913     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4914     // place the pointer in PossibleNonScalarPtrs.
4915     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4916           return isa<LoadInst>(U) || isa<StoreInst>(U);
4917         }))
4918       ScalarPtrs.insert(I);
4919     else
4920       PossibleNonScalarPtrs.insert(I);
4921   };
4922 
4923   // We seed the scalars analysis with two classes of instructions: (1)
4924   // instructions marked uniform-after-vectorization and (2) bitcast,
4925   // getelementptr and (pointer) phi instructions used by memory accesses
4926   // requiring a scalar use.
4927   //
4928   // (1) Add to the worklist all instructions that have been identified as
4929   // uniform-after-vectorization.
4930   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4931 
4932   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4933   // memory accesses requiring a scalar use. The pointer operands of loads and
4934   // stores will be scalar as long as the memory access is not a gather or
4935   // scatter operation. The value operand of a store will remain scalar if the
4936   // store is scalarized.
4937   for (auto *BB : TheLoop->blocks())
4938     for (auto &I : *BB) {
4939       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4940         evaluatePtrUse(Load, Load->getPointerOperand());
4941       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4942         evaluatePtrUse(Store, Store->getPointerOperand());
4943         evaluatePtrUse(Store, Store->getValueOperand());
4944       }
4945     }
4946   for (auto *I : ScalarPtrs)
4947     if (!PossibleNonScalarPtrs.count(I)) {
4948       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4949       Worklist.insert(I);
4950     }
4951 
4952   // Insert the forced scalars.
4953   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4954   // induction variable when the PHI user is scalarized.
4955   auto ForcedScalar = ForcedScalars.find(VF);
4956   if (ForcedScalar != ForcedScalars.end())
4957     for (auto *I : ForcedScalar->second)
4958       Worklist.insert(I);
4959 
4960   // Expand the worklist by looking through any bitcasts and getelementptr
4961   // instructions we've already identified as scalar. This is similar to the
4962   // expansion step in collectLoopUniforms(); however, here we're only
4963   // expanding to include additional bitcasts and getelementptr instructions.
4964   unsigned Idx = 0;
4965   while (Idx != Worklist.size()) {
4966     Instruction *Dst = Worklist[Idx++];
4967     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4968       continue;
4969     auto *Src = cast<Instruction>(Dst->getOperand(0));
4970     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4971           auto *J = cast<Instruction>(U);
4972           return !TheLoop->contains(J) || Worklist.count(J) ||
4973                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4974                   isScalarUse(J, Src));
4975         })) {
4976       Worklist.insert(Src);
4977       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4978     }
4979   }
4980 
4981   // An induction variable will remain scalar if all users of the induction
4982   // variable and induction variable update remain scalar.
4983   for (auto &Induction : Legal->getInductionVars()) {
4984     auto *Ind = Induction.first;
4985     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4986 
4987     // If tail-folding is applied, the primary induction variable will be used
4988     // to feed a vector compare.
4989     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4990       continue;
4991 
4992     // Determine if all users of the induction variable are scalar after
4993     // vectorization.
4994     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4995       auto *I = cast<Instruction>(U);
4996       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4997     });
4998     if (!ScalarInd)
4999       continue;
5000 
5001     // Determine if all users of the induction variable update instruction are
5002     // scalar after vectorization.
5003     auto ScalarIndUpdate =
5004         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5005           auto *I = cast<Instruction>(U);
5006           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5007         });
5008     if (!ScalarIndUpdate)
5009       continue;
5010 
5011     // The induction variable and its update instruction will remain scalar.
5012     Worklist.insert(Ind);
5013     Worklist.insert(IndUpdate);
5014     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5015     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5016                       << "\n");
5017   }
5018 
5019   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5020 }
5021 
5022 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5023                                                          ElementCount VF) {
5024   if (!blockNeedsPredication(I->getParent()))
5025     return false;
5026   switch(I->getOpcode()) {
5027   default:
5028     break;
5029   case Instruction::Load:
5030   case Instruction::Store: {
5031     if (!Legal->isMaskRequired(I))
5032       return false;
5033     auto *Ptr = getLoadStorePointerOperand(I);
5034     auto *Ty = getMemInstValueType(I);
5035     // We have already decided how to vectorize this instruction, get that
5036     // result.
5037     if (VF.isVector()) {
5038       InstWidening WideningDecision = getWideningDecision(I, VF);
5039       assert(WideningDecision != CM_Unknown &&
5040              "Widening decision should be ready at this moment");
5041       return WideningDecision == CM_Scalarize;
5042     }
5043     const Align Alignment = getLoadStoreAlignment(I);
5044     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5045                                 isLegalMaskedGather(Ty, Alignment))
5046                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5047                                 isLegalMaskedScatter(Ty, Alignment));
5048   }
5049   case Instruction::UDiv:
5050   case Instruction::SDiv:
5051   case Instruction::SRem:
5052   case Instruction::URem:
5053     return mayDivideByZero(*I);
5054   }
5055   return false;
5056 }
5057 
5058 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5059     Instruction *I, ElementCount VF) {
5060   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5061   assert(getWideningDecision(I, VF) == CM_Unknown &&
5062          "Decision should not be set yet.");
5063   auto *Group = getInterleavedAccessGroup(I);
5064   assert(Group && "Must have a group.");
5065 
5066   // If the instruction's allocated size doesn't equal its type size, it
5067   // requires padding and will be scalarized.
5068   auto &DL = I->getModule()->getDataLayout();
5069   auto *ScalarTy = getMemInstValueType(I);
5070   if (hasIrregularType(ScalarTy, DL, VF))
5071     return false;
5072 
5073   // Check if masking is required.
5074   // A Group may need masking for one of two reasons: it resides in a block that
5075   // needs predication, or it was decided to use masking to deal with gaps.
5076   bool PredicatedAccessRequiresMasking =
5077       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5078   bool AccessWithGapsRequiresMasking =
5079       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5080   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5081     return true;
5082 
5083   // If masked interleaving is required, we expect that the user/target had
5084   // enabled it, because otherwise it either wouldn't have been created or
5085   // it should have been invalidated by the CostModel.
5086   assert(useMaskedInterleavedAccesses(TTI) &&
5087          "Masked interleave-groups for predicated accesses are not enabled.");
5088 
5089   auto *Ty = getMemInstValueType(I);
5090   const Align Alignment = getLoadStoreAlignment(I);
5091   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5092                           : TTI.isLegalMaskedStore(Ty, Alignment);
5093 }
5094 
5095 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5096     Instruction *I, ElementCount VF) {
5097   // Get and ensure we have a valid memory instruction.
5098   LoadInst *LI = dyn_cast<LoadInst>(I);
5099   StoreInst *SI = dyn_cast<StoreInst>(I);
5100   assert((LI || SI) && "Invalid memory instruction");
5101 
5102   auto *Ptr = getLoadStorePointerOperand(I);
5103 
5104   // To be widened, the memory access must first have a consecutive pointer.
5105   if (!Legal->isConsecutivePtr(Ptr))
5106     return false;
5107 
5108   // If the instruction is a store located in a predicated block, it will be
5109   // scalarized.
5110   if (isScalarWithPredication(I))
5111     return false;
5112 
5113   // If the instruction's allocated size doesn't equal its type size, it
5114   // requires padding and will be scalarized.
5115   auto &DL = I->getModule()->getDataLayout();
5116   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5117   if (hasIrregularType(ScalarTy, DL, VF))
5118     return false;
5119 
5120   return true;
5121 }
5122 
5123 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5124   // We should not collect Uniforms more than once per VF. Right now,
5125   // this function is called from collectUniformsAndScalars(), which
5126   // already does this check. Collecting Uniforms for VF=1 does not make any
5127   // sense.
5128 
5129   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5130          "This function should not be visited twice for the same VF");
5131 
5132   // Initialize the entry for this VF so that, even if no uniform values are
5133   // found, Uniforms.count(VF) returns 1 and the loop is not analyzed again.
5134   Uniforms[VF].clear();
5135 
5136   // We now know that the loop is vectorizable!
5137   // Collect instructions inside the loop that will remain uniform after
5138   // vectorization.
5139 
5140   // Global values, params and instructions outside of current loop are out of
5141   // scope.
5142   auto isOutOfScope = [&](Value *V) -> bool {
5143     Instruction *I = dyn_cast<Instruction>(V);
5144     return (!I || !TheLoop->contains(I));
5145   };
5146 
5147   SetVector<Instruction *> Worklist;
5148   BasicBlock *Latch = TheLoop->getLoopLatch();
5149 
5150   // Instructions that are scalar with predication must not be considered
5151   // uniform after vectorization, because that would create an erroneous
5152   // replicating region where only a single instance out of VF should be formed.
5153   // TODO: optimize such seldom cases if found important, see PR40816.
5154   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5155     if (isOutOfScope(I)) {
5156       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5157                         << *I << "\n");
5158       return;
5159     }
5160     if (isScalarWithPredication(I, VF)) {
5161       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5162                         << *I << "\n");
5163       return;
5164     }
5165     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5166     Worklist.insert(I);
5167   };
5168 
5169   // Start with the conditional branch. If the branch condition is an
5170   // instruction contained in the loop that is only used by the branch, it is
5171   // uniform.
5172   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5173   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5174     addToWorklistIfAllowed(Cmp);
5175 
5176   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5177     InstWidening WideningDecision = getWideningDecision(I, VF);
5178     assert(WideningDecision != CM_Unknown &&
5179            "Widening decision should be ready at this moment");
5180 
5181     // A uniform memory op is itself uniform.  We exclude uniform stores
5182     // here as they demand the last lane, not the first one.
5183     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5184       assert(WideningDecision == CM_Scalarize);
5185       return true;
5186     }
5187 
5188     return (WideningDecision == CM_Widen ||
5189             WideningDecision == CM_Widen_Reverse ||
5190             WideningDecision == CM_Interleave);
5191   };
5192 
5193 
5194   // Returns true if Ptr is the pointer operand of a memory access instruction
5195   // I, and I is known to not require scalarization.
5196   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5197     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5198   };
5199 
5200   // Holds a list of values which are known to have at least one uniform use.
5201   // Note that there may be other uses which aren't uniform.  A "uniform use"
5202   // here is something which only demands lane 0 of the unrolled iterations;
5203   // it does not imply that all lanes produce the same value (i.e. this is not
5204   // the usual meaning of uniform).
5205   SmallPtrSet<Value *, 8> HasUniformUse;
5206 
5207   // Scan the loop for instructions which are either a) known to have only
5208   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5209   for (auto *BB : TheLoop->blocks())
5210     for (auto &I : *BB) {
5211       // If there's no pointer operand, there's nothing to do.
5212       auto *Ptr = getLoadStorePointerOperand(&I);
5213       if (!Ptr)
5214         continue;
5215 
5216       // A uniform memory op is itself uniform.  We exclude uniform stores
5217       // here as they demand the last lane, not the first one.
5218       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5219         addToWorklistIfAllowed(&I);
5220 
5221       if (isUniformDecision(&I, VF)) {
5222         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5223         HasUniformUse.insert(Ptr);
5224       }
5225     }
5226 
5227   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5228   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5229   // disallows uses outside the loop as well.
5230   for (auto *V : HasUniformUse) {
5231     if (isOutOfScope(V))
5232       continue;
5233     auto *I = cast<Instruction>(V);
5234     auto UsersAreMemAccesses =
5235       llvm::all_of(I->users(), [&](User *U) -> bool {
5236         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5237       });
5238     if (UsersAreMemAccesses)
5239       addToWorklistIfAllowed(I);
5240   }
5241 
5242   // Expand Worklist in topological order: whenever a new instruction
5243   // is added, its users should already be inside Worklist. This ensures that
5244   // a uniform instruction will only be used by uniform instructions.
5245   unsigned idx = 0;
5246   while (idx != Worklist.size()) {
5247     Instruction *I = Worklist[idx++];
5248 
5249     for (auto OV : I->operand_values()) {
5250       // isOutOfScope operands cannot be uniform instructions.
5251       if (isOutOfScope(OV))
5252         continue;
5253       // First order recurrence Phi's should typically be considered
5254       // non-uniform.
5255       auto *OP = dyn_cast<PHINode>(OV);
5256       if (OP && Legal->isFirstOrderRecurrence(OP))
5257         continue;
5258       // If all the users of the operand are uniform, then add the
5259       // operand into the uniform worklist.
5260       auto *OI = cast<Instruction>(OV);
5261       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5262             auto *J = cast<Instruction>(U);
5263             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5264           }))
5265         addToWorklistIfAllowed(OI);
5266     }
5267   }
5268 
5269   // For an instruction to be added into Worklist above, all its users inside
5270   // the loop should also be in Worklist. However, this condition cannot be
5271   // true for phi nodes that form a cyclic dependence. We must process phi
5272   // nodes separately. An induction variable will remain uniform if all users
5273   // of the induction variable and induction variable update remain uniform.
5274   // The code below handles both pointer and non-pointer induction variables.
5275   for (auto &Induction : Legal->getInductionVars()) {
5276     auto *Ind = Induction.first;
5277     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5278 
5279     // Determine if all users of the induction variable are uniform after
5280     // vectorization.
5281     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5282       auto *I = cast<Instruction>(U);
5283       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5284              isVectorizedMemAccessUse(I, Ind);
5285     });
5286     if (!UniformInd)
5287       continue;
5288 
5289     // Determine if all users of the induction variable update instruction are
5290     // uniform after vectorization.
5291     auto UniformIndUpdate =
5292         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5293           auto *I = cast<Instruction>(U);
5294           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5295                  isVectorizedMemAccessUse(I, IndUpdate);
5296         });
5297     if (!UniformIndUpdate)
5298       continue;
5299 
5300     // The induction variable and its update instruction will remain uniform.
5301     addToWorklistIfAllowed(Ind);
5302     addToWorklistIfAllowed(IndUpdate);
5303   }
5304 
5305   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5306 }
5307 
5308 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5309   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5310 
5311   if (Legal->getRuntimePointerChecking()->Need) {
5312     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5313         "runtime pointer checks needed. Enable vectorization of this "
5314         "loop with '#pragma clang loop vectorize(enable)' when "
5315         "compiling with -Os/-Oz",
5316         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5317     return true;
5318   }
5319 
5320   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5321     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5322         "runtime SCEV checks needed. Enable vectorization of this "
5323         "loop with '#pragma clang loop vectorize(enable)' when "
5324         "compiling with -Os/-Oz",
5325         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5326     return true;
5327   }
5328 
5329   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5330   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5331     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
5332         "runtime stride == 1 checks needed. Enable vectorization of this "
5333         "loop with '#pragma clang loop vectorize(enable)' when "
5334         "compiling with -Os/-Oz", "CantVersionLoopWithOptForSize", ORE, TheLoop);
5335     return true;
5336   }
5337 
5338   return false;
5339 }
5340 
5341 Optional<ElementCount>
5342 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5343   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5344     // TODO: It may be useful to do this, since the check is still likely to be
5345     // dynamically uniform if the target can skip it.
5346     reportVectorizationFailure(
5347         "Not inserting runtime ptr check for divergent target",
5348         "runtime pointer checks needed. Not enabled for divergent target",
5349         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5350     return None;
5351   }
5352 
5353   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5354   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5355   if (TC == 1) {
5356     reportVectorizationFailure("Single iteration (non) loop",
5357         "loop trip count is one, irrelevant for vectorization",
5358         "SingleIterationLoop", ORE, TheLoop);
5359     return None;
5360   }
5361 
5362   switch (ScalarEpilogueStatus) {
5363   case CM_ScalarEpilogueAllowed:
5364     return computeFeasibleMaxVF(TC, UserVF);
5365   case CM_ScalarEpilogueNotAllowedUsePredicate:
5366     LLVM_FALLTHROUGH;
5367   case CM_ScalarEpilogueNotNeededUsePredicate:
5368     LLVM_DEBUG(
5369         dbgs() << "LV: vector predicate hint/switch found.\n"
5370                << "LV: Not allowing scalar epilogue, creating predicated "
5371                << "vector loop.\n");
5372     break;
5373   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5374     // fallthrough as a special case of OptForSize
5375   case CM_ScalarEpilogueNotAllowedOptSize:
5376     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5377       LLVM_DEBUG(
5378           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5379     else
5380       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5381                         << "count.\n");
5382 
5383     // Bail if runtime checks are required, which are not good when optimizing
5384     // for size.
5385     if (runtimeChecksRequired())
5386       return None;
5387 
5388     break;
5389   }
5390 
5391   // The only loops we can vectorize without a scalar epilogue, are loops with
5392   // a bottom-test and a single exiting block. We'd have to handle the fact
5393   // that not every instruction executes on the last iteration.  This will
5394   // require a lane mask which varies through the vector loop body.  (TODO)
5395   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5396     // If there was a tail-folding hint/switch, but we can't fold the tail by
5397     // masking, fallback to a vectorization with a scalar epilogue.
5398     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5399       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5400                            "scalar epilogue instead.\n");
5401       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5402       return computeFeasibleMaxVF(TC, UserVF);
5403     }
5404     return None;
5405   }
5406 
5407   // Now try to fold the tail by masking.
5408 
5409   // Invalidate interleave groups that require an epilogue if we can't mask
5410   // the interleave-group.
5411   if (!useMaskedInterleavedAccesses(TTI)) {
5412     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5413            "No decisions should have been taken at this point");
5414     // Note: There is no need to invalidate any cost modeling decisions here, as
5415     // none were taken so far.
5416     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5417   }
5418 
5419   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5420   assert(!MaxVF.isScalable() &&
5421          "Scalable vectors do not yet support tail folding");
5422   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5423          "MaxVF must be a power of 2");
5424   unsigned MaxVFtimesIC =
5425       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5426   // Avoid tail folding if the trip count is known to be a multiple of any VF we
5427   // choose.
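  // For example, with a known trip count of 128, MaxVF = 8 and UserIC = 2,
  // MaxVFtimesIC is 16 and 128 urem 16 == 0, so no tail remains and tail
  // folding is not needed.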
5428   ScalarEvolution *SE = PSE.getSE();
5429   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5430   const SCEV *ExitCount = SE->getAddExpr(
5431       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5432   const SCEV *Rem = SE->getURemExpr(
5433       SE->applyLoopGuards(ExitCount, TheLoop),
5434       SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5435   if (Rem->isZero()) {
5436     // Accept MaxVF if we do not have a tail.
5437     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5438     return MaxVF;
5439   }
5440 
5441   // If we don't know the precise trip count, or if the trip count that we
5442   // found modulo the vectorization factor is not zero, try to fold the tail
5443   // by masking.
5444   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5445   if (Legal->prepareToFoldTailByMasking()) {
5446     FoldTailByMasking = true;
5447     return MaxVF;
5448   }
5449 
5450   // If there was a tail-folding hint/switch, but we can't fold the tail by
5451   // masking, fallback to a vectorization with a scalar epilogue.
5452   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5453     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5454                          "scalar epilogue instead.\n");
5455     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5456     return MaxVF;
5457   }
5458 
5459   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5460     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5461     return None;
5462   }
5463 
5464   if (TC == 0) {
5465     reportVectorizationFailure(
5466         "Unable to calculate the loop count due to complex control flow",
5467         "unable to calculate the loop count due to complex control flow",
5468         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5469     return None;
5470   }
5471 
5472   reportVectorizationFailure(
5473       "Cannot optimize for size and vectorize at the same time.",
5474       "cannot optimize for size and vectorize at the same time. "
5475       "Enable vectorization of this loop with '#pragma clang loop "
5476       "vectorize(enable)' when compiling with -Os/-Oz",
5477       "NoTailLoopWithOptForSize", ORE, TheLoop);
5478   return None;
5479 }
5480 
5481 ElementCount
5482 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5483                                                  ElementCount UserVF) {
5484   bool IgnoreScalableUserVF = UserVF.isScalable() &&
5485                               !TTI.supportsScalableVectors() &&
5486                               !ForceTargetSupportsScalableVectors;
5487   if (IgnoreScalableUserVF) {
5488     LLVM_DEBUG(
5489         dbgs() << "LV: Ignoring VF=" << UserVF
5490                << " because target does not support scalable vectors.\n");
5491     ORE->emit([&]() {
5492       return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
5493                                         TheLoop->getStartLoc(),
5494                                         TheLoop->getHeader())
5495              << "Ignoring VF=" << ore::NV("UserVF", UserVF)
5496              << " because target does not support scalable vectors.";
5497     });
5498   }
5499 
5500   // Beyond this point two scenarios are handled. If UserVF isn't specified
5501   // then a suitable VF is chosen. If UserVF is specified and there are
5502   // dependencies, check if it's legal. However, if a UserVF is specified and
5503   // there are no dependencies, then there's nothing to do.
5504   if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
5505     if (!canVectorizeReductions(UserVF)) {
5506       reportVectorizationFailure(
5507           "LV: Scalable vectorization not supported for the reduction "
5508           "operations found in this loop. Using fixed-width "
5509           "vectorization instead.",
5510           "Scalable vectorization not supported for the reduction operations "
5511           "found in this loop. Using fixed-width vectorization instead.",
5512           "ScalableVFUnfeasible", ORE, TheLoop);
5513       return computeFeasibleMaxVF(
5514           ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
5515     }
5516 
5517     if (Legal->isSafeForAnyVectorWidth())
5518       return UserVF;
5519   }
5520 
5521   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5522   unsigned SmallestType, WidestType;
5523   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5524   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5525 
5526   // Get the maximum safe dependence distance in bits computed by LAA.
5527   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5528   // the memory access that is most restrictive (involved in the smallest
5529   // dependence distance).
5530   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5531 
5532   // If the user vectorization factor is legally unsafe, clamp it to a safe
5533   // value. Otherwise, return as is.
5534   if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
5535     unsigned MaxSafeElements =
5536         PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5537     ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
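    // For example, a maximum safe dependence width of 256 bits and a widest
    // type of 32 bits give MaxSafeElements = PowerOf2Floor(256 / 32) = 8.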
5538 
5539     if (UserVF.isScalable()) {
5540       Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5541 
5542       // Scale VF by vscale before checking if it's safe.
5543       MaxSafeVF = ElementCount::getScalable(
5544           MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
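      // For example, with MaxSafeElements = 8 and a maximum vscale of 16, the
      // scalable element count computed here is zero, which triggers the
      // fixed-width fallback below.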
5545 
5546       if (MaxSafeVF.isZero()) {
5547         // The dependence distance is too small to use scalable vectors,
5548         // fallback on fixed.
5549         LLVM_DEBUG(
5550             dbgs()
5551             << "LV: Max legal vector width too small, scalable vectorization "
5552                "unfeasible. Using fixed-width vectorization instead.\n");
5553         ORE->emit([&]() {
5554           return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
5555                                             TheLoop->getStartLoc(),
5556                                             TheLoop->getHeader())
5557                  << "Max legal vector width too small, scalable vectorization "
5558                  << "unfeasible. Using fixed-width vectorization instead.";
5559         });
5560         return computeFeasibleMaxVF(
5561             ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
5562       }
5563     }
5564 
5565     LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
5566 
5567     if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
5568       return UserVF;
5569 
5570     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5571                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5572                       << ".\n");
5573     ORE->emit([&]() {
5574       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5575                                         TheLoop->getStartLoc(),
5576                                         TheLoop->getHeader())
5577              << "User-specified vectorization factor "
5578              << ore::NV("UserVectorizationFactor", UserVF)
5579              << " is unsafe, clamping to maximum safe vectorization factor "
5580              << ore::NV("VectorizationFactor", MaxSafeVF);
5581     });
5582     return MaxSafeVF;
5583   }
5584 
5585   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5586 
5587   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5588   // Note that both WidestRegister and WidestType may not be powers of 2.
5589   auto MaxVectorSize =
5590       ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
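  // For example, a 256-bit widest register and a 64-bit widest type give a
  // default MaxVectorSize of PowerOf2Floor(256 / 64) = 4 elements.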
5591 
5592   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5593                     << " / " << WidestType << " bits.\n");
5594   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5595                     << WidestRegister << " bits.\n");
5596 
5597   assert(MaxVectorSize.getFixedValue() <= WidestRegister &&
5598          "Did not expect to pack so many elements"
5599          " into one vector!");
5600   if (MaxVectorSize.getFixedValue() == 0) {
5601     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5602     return ElementCount::getFixed(1);
5603   } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() &&
5604              isPowerOf2_32(ConstTripCount)) {
5605     // We need to clamp the VF to be the ConstTripCount. There is no point in
5606     // choosing a higher viable VF as done in the loop below.
5607     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5608                       << ConstTripCount << "\n");
5609     return ElementCount::getFixed(ConstTripCount);
5610   }
5611 
5612   ElementCount MaxVF = MaxVectorSize;
5613   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5614       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5615     // Collect all viable vectorization factors larger than the default MaxVF
5616     // (i.e. MaxVectorSize).
5617     SmallVector<ElementCount, 8> VFs;
5618     auto MaxVectorSizeMaxBW =
5619         ElementCount::getFixed(WidestRegister / SmallestType);
5620     for (ElementCount VS = MaxVectorSize * 2;
5621          ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2)
5622       VFs.push_back(VS);
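    // For example, with a default MaxVectorSize of 4 on a 256-bit register and
    // a smallest type of 8 bits, the candidate VFs collected here are 8, 16
    // and 32.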
5623 
5624     // For each VF calculate its register usage.
5625     auto RUs = calculateRegisterUsage(VFs);
5626 
5627     // Select the largest VF which doesn't require more registers than existing
5628     // ones.
5629     for (int i = RUs.size() - 1; i >= 0; --i) {
5630       bool Selected = true;
5631       for (auto &pair : RUs[i].MaxLocalUsers) {
5632         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5633         if (pair.second > TargetNumRegisters)
5634           Selected = false;
5635       }
5636       if (Selected) {
5637         MaxVF = VFs[i];
5638         break;
5639       }
5640     }
5641     if (ElementCount MinVF =
5642             TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) {
5643       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5644         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5645                           << ") with target's minimum: " << MinVF << '\n');
5646         MaxVF = MinVF;
5647       }
5648     }
5649   }
5650   return MaxVF;
5651 }
5652 
5653 VectorizationFactor
5654 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5655   // FIXME: This can be fixed for scalable vectors later, because at this stage
5656   // the LoopVectorizer will only consider vectorizing a loop with scalable
5657   // vectors when the loop has a hint to enable vectorization for a given VF.
5658   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5659 
5660   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5661   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5662   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5663 
5664   auto Width = ElementCount::getFixed(1);
5665   const float ScalarCost = *ExpectedCost.getValue();
5666   float Cost = ScalarCost;
5667 
5668   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5669   if (ForceVectorization && MaxVF.isVector()) {
5670     // Ignore scalar width, because the user explicitly wants vectorization.
5671     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5672     // evaluation.
5673     Cost = std::numeric_limits<float>::max();
5674   }
5675 
5676   for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
5677        i *= 2) {
5678     // Notice that the vector loop needs to be executed fewer times, so we
5679     // divide the cost of the vector loop by the vectorization factor to get a
5680     // per-iteration cost that is comparable to the scalar loop cost.
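    // For example, if one scalar iteration costs 8 and the VF = 4 vector body
    // costs 20, the normalized vector cost is 20 / 4 = 5, which is cheaper
    // than the scalar loop.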
5681     VectorizationCostTy C = expectedCost(i);
5682     assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
5683     float VectorCost = *C.first.getValue() / (float)i.getFixedValue();
5684     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5685                       << " costs: " << (int)VectorCost << ".\n");
5686     if (!C.second && !ForceVectorization) {
5687       LLVM_DEBUG(
5688           dbgs() << "LV: Not considering vector loop of width " << i
5689                  << " because it will not generate any vector instructions.\n");
5690       continue;
5691     }
5692 
5693     // If profitable add it to ProfitableVF list.
5694     if (VectorCost < ScalarCost) {
5695       ProfitableVFs.push_back(VectorizationFactor(
5696           {i, (unsigned)VectorCost}));
5697     }
5698 
5699     if (VectorCost < Cost) {
5700       Cost = VectorCost;
5701       Width = i;
5702     }
5703   }
5704 
5705   if (!EnableCondStoresVectorization && NumPredStores) {
5706     reportVectorizationFailure("There are conditional stores.",
5707         "store that is conditionally executed prevents vectorization",
5708         "ConditionalStore", ORE, TheLoop);
5709     Width = ElementCount::getFixed(1);
5710     Cost = ScalarCost;
5711   }
5712 
5713   LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs()
5714              << "LV: Vectorization seems to be not beneficial, "
5715              << "but was forced by a user.\n");
5716   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5717   VectorizationFactor Factor = {Width,
5718                                 (unsigned)(Width.getKnownMinValue() * Cost)};
5719   return Factor;
5720 }
5721 
5722 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5723     const Loop &L, ElementCount VF) const {
5724   // Cross iteration phis such as reductions need special handling and are
5725   // currently unsupported.
5726   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5727         return Legal->isFirstOrderRecurrence(&Phi) ||
5728                Legal->isReductionVariable(&Phi);
5729       }))
5730     return false;
5731 
5732   // Phis with uses outside of the loop require special handling and are
5733   // currently unsupported.
5734   for (auto &Entry : Legal->getInductionVars()) {
5735     // Look for uses of the value of the induction at the last iteration.
5736     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5737     for (User *U : PostInc->users())
5738       if (!L.contains(cast<Instruction>(U)))
5739         return false;
5740     // Look for uses of penultimate value of the induction.
5741     for (User *U : Entry.first->users())
5742       if (!L.contains(cast<Instruction>(U)))
5743         return false;
5744   }
5745 
5746   // Induction variables that are widened require special handling that is
5747   // currently not supported.
5748   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5749         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5750                  this->isProfitableToScalarize(Entry.first, VF));
5751       }))
5752     return false;
5753 
5754   return true;
5755 }
5756 
5757 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5758     const ElementCount VF) const {
5759   // FIXME: We need a much better cost-model to take different parameters such
5760   // as register pressure, code size increase and cost of extra branches into
5761   // account. For now we apply a very crude heuristic and only consider loops
5762   // with vectorization factors larger than a certain value.
5763   // We also consider epilogue vectorization unprofitable for targets that don't
5764   // consider interleaving beneficial (eg. MVE).
5765   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5766     return false;
5767   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5768     return true;
5769   return false;
5770 }
5771 
5772 VectorizationFactor
5773 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5774     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5775   VectorizationFactor Result = VectorizationFactor::Disabled();
5776   if (!EnableEpilogueVectorization) {
5777     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5778     return Result;
5779   }
5780 
5781   if (!isScalarEpilogueAllowed()) {
5782     LLVM_DEBUG(
5783         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5784                   "allowed.\n";);
5785     return Result;
5786   }
5787 
5788   // FIXME: This can be fixed for scalable vectors later, because at this stage
5789   // the LoopVectorizer will only consider vectorizing a loop with scalable
5790   // vectors when the loop has a hint to enable vectorization for a given VF.
5791   if (MainLoopVF.isScalable()) {
5792     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
5793                          "yet supported.\n");
5794     return Result;
5795   }
5796 
5797   // Not really a cost consideration, but check for unsupported cases here to
5798   // simplify the logic.
5799   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5800     LLVM_DEBUG(
5801         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5802                   "not a supported candidate.\n";);
5803     return Result;
5804   }
5805 
5806   if (EpilogueVectorizationForceVF > 1) {
5807     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5808     if (LVP.hasPlanWithVFs(
5809             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5810       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5811     else {
5812       LLVM_DEBUG(
5813           dbgs()
5814               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5815       return Result;
5816     }
5817   }
5818 
5819   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5820       TheLoop->getHeader()->getParent()->hasMinSize()) {
5821     LLVM_DEBUG(
5822         dbgs()
5823             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5824     return Result;
5825   }
5826 
5827   if (!isEpilogueVectorizationProfitable(MainLoopVF))
5828     return Result;
5829 
5830   for (auto &NextVF : ProfitableVFs)
5831     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
5832         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
5833         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
5834       Result = NextVF;
5835 
5836   if (Result != VectorizationFactor::Disabled())
5837     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5838                       << Result.Width.getFixedValue() << "\n";);
5839   return Result;
5840 }
5841 
5842 std::pair<unsigned, unsigned>
5843 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5844   unsigned MinWidth = -1U;
5845   unsigned MaxWidth = 8;
5846   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5847 
5848   // For each block.
5849   for (BasicBlock *BB : TheLoop->blocks()) {
5850     // For each instruction in the loop.
5851     for (Instruction &I : BB->instructionsWithoutDebug()) {
5852       Type *T = I.getType();
5853 
5854       // Skip ignored values.
5855       if (ValuesToIgnore.count(&I))
5856         continue;
5857 
5858       // Only examine Loads, Stores and PHINodes.
5859       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5860         continue;
5861 
5862       // Examine PHI nodes that are reduction variables. Update the type to
5863       // account for the recurrence type.
5864       if (auto *PN = dyn_cast<PHINode>(&I)) {
5865         if (!Legal->isReductionVariable(PN))
5866           continue;
5867         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5868         if (PreferInLoopReductions ||
5869             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5870                                       RdxDesc.getRecurrenceType(),
5871                                       TargetTransformInfo::ReductionFlags()))
5872           continue;
5873         T = RdxDesc.getRecurrenceType();
5874       }
5875 
5876       // Examine the stored values.
5877       if (auto *ST = dyn_cast<StoreInst>(&I))
5878         T = ST->getValueOperand()->getType();
5879 
5880       // Ignore loaded pointer types and stored pointer types that are not
5881       // vectorizable.
5882       //
5883       // FIXME: The check here attempts to predict whether a load or store will
5884       //        be vectorized. We only know this for certain after a VF has
5885       //        been selected. Here, we assume that if an access can be
5886       //        vectorized, it will be. We should also look at extending this
5887       //        optimization to non-pointer types.
5888       //
5889       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5890           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5891         continue;
5892 
5893       MinWidth = std::min(MinWidth,
5894                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5895       MaxWidth = std::max(MaxWidth,
5896                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5897     }
5898   }
5899 
5900   return {MinWidth, MaxWidth};
5901 }
5902 
5903 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5904                                                            unsigned LoopCost) {
5905   // -- The interleave heuristics --
5906   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5907   // There are many micro-architectural considerations that we can't predict
5908   // at this level. For example, frontend pressure (on decode or fetch) due to
5909   // code size, or the number and capabilities of the execution ports.
5910   //
5911   // We use the following heuristics to select the interleave count:
5912   // 1. If the code has reductions, then we interleave to break the cross
5913   // iteration dependency.
5914   // 2. If the loop is really small, then we interleave to reduce the loop
5915   // overhead.
5916   // 3. We don't interleave if we think that we will spill registers to memory
5917   // due to the increased register pressure.
5918 
5919   if (!isScalarEpilogueAllowed())
5920     return 1;
5921 
5922   // Do not interleave loops with a bounded maximum safe dependence distance.
5923   if (Legal->getMaxSafeDepDistBytes() != -1U)
5924     return 1;
5925 
5926   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5927   const bool HasReductions = !Legal->getReductionVars().empty();
5928   // Do not interleave loops with a relatively small known or estimated trip
5929   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5930   // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5931   // because with the above conditions interleaving can expose ILP and break
5932   // cross-iteration dependences for reductions.
5933   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5934       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5935     return 1;
5936 
5937   RegisterUsage R = calculateRegisterUsage({VF})[0];
5938   // We divide by these counts below, so clamp each of them to at least one to
5939   // avoid dividing by zero.
5940   for (auto& pair : R.MaxLocalUsers) {
5941     pair.second = std::max(pair.second, 1U);
5942   }
5943 
5944   // We calculate the interleave count using the following formula.
5945   // Subtract the number of loop invariants from the number of available
5946   // registers. These registers are used by all of the interleaved instances.
5947   // Next, divide the remaining registers by the number of registers that is
5948   // required by the loop, in order to estimate how many parallel instances
5949   // fit without causing spills. All of this is rounded down if necessary to be
5950   // a power of two. We want power of two interleave count to simplify any
5951   // addressing operations or alignment considerations.
5952   // We also want power of two interleave counts to ensure that the induction
5953   // variable of the vector loop wraps to zero, when tail is folded by masking;
5954   // this currently happens when OptForSize, in which case IC is set to 1 above.
5955   unsigned IC = UINT_MAX;
5956 
5957   for (auto& pair : R.MaxLocalUsers) {
5958     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5959     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5960                       << " registers of "
5961                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5962     if (VF.isScalar()) {
5963       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5964         TargetNumRegisters = ForceTargetNumScalarRegs;
5965     } else {
5966       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5967         TargetNumRegisters = ForceTargetNumVectorRegs;
5968     }
5969     unsigned MaxLocalUsers = pair.second;
5970     unsigned LoopInvariantRegs = 0;
5971     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5972       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5973 
5974     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5975     // Don't count the induction variable as interleaved.
5976     if (EnableIndVarRegisterHeur) {
5977       TmpIC =
5978           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5979                         std::max(1U, (MaxLocalUsers - 1)));
5980     }
5981 
5982     IC = std::min(IC, TmpIC);
5983   }
5984 
5985   // Clamp the interleave ranges to reasonable counts.
5986   unsigned MaxInterleaveCount =
5987       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5988 
5989   // Check if the user has overridden the max.
5990   if (VF.isScalar()) {
5991     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5992       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5993   } else {
5994     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5995       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5996   }
5997 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count so that it does not exceed the trip count divided by VF,
  // provided the result is at least 1.
6001   //
6002   // For scalable vectors we can't know if interleaving is beneficial. It may
6003   // not be beneficial for small loops if none of the lanes in the second vector
  // iteration is enabled. However, for larger loops, there is likely to be a
6005   // similar benefit as for fixed-width vectors. For now, we choose to leave
6006   // the InterleaveCount as if vscale is '1', although if some information about
6007   // the vector is known (e.g. min vector size), we can make a better decision.
6008   if (BestKnownTC) {
6009     MaxInterleaveCount =
6010         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6011     // Make sure MaxInterleaveCount is greater than 0.
6012     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6013   }
6014 
6015   assert(MaxInterleaveCount > 0 &&
6016          "Maximum interleave count must be greater than 0");
6017 
  // Clamp the calculated IC to be between 1 and the max interleave count that
  // the target and trip count allow.
6020   if (IC > MaxInterleaveCount)
6021     IC = MaxInterleaveCount;
6022   else
6023     // Make sure IC is greater than 0.
6024     IC = std::max(1u, IC);
6025 
6026   assert(IC > 0 && "Interleave count must be greater than 0.");
6027 
6028   // If we did not calculate the cost for VF (because the user selected the VF)
6029   // then we calculate the cost of VF here.
6030   if (LoopCost == 0) {
6031     assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
6032     LoopCost = *expectedCost(VF).first.getValue();
6033   }
6034 
6035   assert(LoopCost && "Non-zero loop cost expected");
6036 
6037   // Interleave if we vectorized this loop and there is a reduction that could
6038   // benefit from interleaving.
6039   if (VF.isVector() && HasReductions) {
6040     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6041     return IC;
6042   }
6043 
6044   // Note that if we've already vectorized the loop we will have done the
6045   // runtime check and so interleaving won't require further checks.
6046   bool InterleavingRequiresRuntimePointerCheck =
6047       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6048 
6049   // We want to interleave small loops in order to reduce the loop overhead and
6050   // potentially expose ILP opportunities.
6051   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6052                     << "LV: IC is " << IC << '\n'
6053                     << "LV: VF is " << VF << '\n');
6054   const bool AggressivelyInterleaveReductions =
6055       TTI.enableAggressiveInterleaving(HasReductions);
6056   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6057     // We assume that the cost overhead is 1 and we use the cost model
6058     // to estimate the cost of the loop and interleave until the cost of the
6059     // loop overhead is about 5% of the cost of the loop.
6060     unsigned SmallIC =
6061         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
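    // For illustration only (the numbers are assumed, not measured): with
    // SmallLoopCost = 20 and a computed LoopCost of 6, this gives
    // PowerOf2Floor(20 / 6) = 2, so SmallIC = min(IC, 2).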
6062 
6063     // Interleave until store/load ports (estimated by max interleave count) are
6064     // saturated.
6065     unsigned NumStores = Legal->getNumStores();
6066     unsigned NumLoads = Legal->getNumLoads();
6067     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6068     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6069 
6070     // If we have a scalar reduction (vector reductions are already dealt with
6071     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit this by default to 2 so
    // that the critical path only gets increased by one reduction operation.
6074     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6075       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6076       SmallIC = std::min(SmallIC, F);
6077       StoresIC = std::min(StoresIC, F);
6078       LoadsIC = std::min(LoadsIC, F);
6079     }
6080 
6081     if (EnableLoadStoreRuntimeInterleave &&
6082         std::max(StoresIC, LoadsIC) > SmallIC) {
6083       LLVM_DEBUG(
6084           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6085       return std::max(StoresIC, LoadsIC);
6086     }
6087 
6088     // If there are scalar reductions and TTI has enabled aggressive
6089     // interleaving for reductions, we will interleave to expose ILP.
6090     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6091         AggressivelyInterleaveReductions) {
6092       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC to handle the rare situation where resources are too limited.
6095       return std::max(IC / 2, SmallIC);
6096     } else {
6097       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6098       return SmallIC;
6099     }
6100   }
6101 
6102   // Interleave if this is a large loop (small loops are already dealt with by
6103   // this point) that could benefit from interleaving.
6104   if (AggressivelyInterleaveReductions) {
6105     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6106     return IC;
6107   }
6108 
6109   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6110   return 1;
6111 }
6112 
6113 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6114 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6115   // This function calculates the register usage by measuring the highest number
6116   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
6118   // assign a number to each instruction. We use RPO to ensure that defs are
6119   // met before their users. We assume that each instruction that has in-loop
6120   // users starts an interval. We record every time that an in-loop value is
6121   // used, so we have a list of the first and last occurrences of each
6122   // instruction. Next, we transpose this data structure into a multi map that
6123   // holds the list of intervals that *end* at a specific location. This multi
6124   // map allows us to perform a linear search. We scan the instructions linearly
6125   // and record each time that a new interval starts, by placing it in a set.
6126   // If we find this value in the multi-map then we remove it from the set.
6127   // The max register usage is the maximum size of the set.
6128   // We also search for instructions that are defined outside the loop, but are
6129   // used inside the loop. We need this number separately from the max-interval
6130   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
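  // A small sketch of the interval idea (instruction names invented):
  //   %a = load ...
  //   %b = add %a, 1
  //   %c = mul %a, %b
  // %a remains live across %b because %c still needs it, so just before %c is
  // processed the intervals of both %a and %b are open at once; the maximum
  // number of simultaneously open intervals per register class is what we
  // report.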
6132   LoopBlocksDFS DFS(TheLoop);
6133   DFS.perform(LI);
6134 
6135   RegisterUsage RU;
6136 
6137   // Each 'key' in the map opens a new interval. The values
6138   // of the map are the index of the 'last seen' usage of the
6139   // instruction that is the key.
6140   using IntervalMap = DenseMap<Instruction *, unsigned>;
6141 
6142   // Maps instruction to its index.
6143   SmallVector<Instruction *, 64> IdxToInstr;
6144   // Marks the end of each interval.
6145   IntervalMap EndPoint;
  // Saves the list of instructions that are used in the loop.
6147   SmallPtrSet<Instruction *, 8> Ends;
6148   // Saves the list of values that are used in the loop but are
6149   // defined outside the loop, such as arguments and constants.
6150   SmallPtrSet<Value *, 8> LoopInvariants;
6151 
6152   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6153     for (Instruction &I : BB->instructionsWithoutDebug()) {
6154       IdxToInstr.push_back(&I);
6155 
6156       // Save the end location of each USE.
6157       for (Value *U : I.operands()) {
6158         auto *Instr = dyn_cast<Instruction>(U);
6159 
6160         // Ignore non-instruction values such as arguments, constants, etc.
6161         if (!Instr)
6162           continue;
6163 
6164         // If this instruction is outside the loop then record it and continue.
6165         if (!TheLoop->contains(Instr)) {
6166           LoopInvariants.insert(Instr);
6167           continue;
6168         }
6169 
6170         // Overwrite previous end points.
6171         EndPoint[Instr] = IdxToInstr.size();
6172         Ends.insert(Instr);
6173       }
6174     }
6175   }
6176 
6177   // Saves the list of intervals that end with the index in 'key'.
6178   using InstrList = SmallVector<Instruction *, 2>;
6179   DenseMap<unsigned, InstrList> TransposeEnds;
6180 
6181   // Transpose the EndPoints to a list of values that end at each index.
6182   for (auto &Interval : EndPoint)
6183     TransposeEnds[Interval.second].push_back(Interval.first);
6184 
6185   SmallPtrSet<Instruction *, 8> OpenIntervals;
6186   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6187   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6188 
6189   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6190 
6191   // A lambda that gets the register usage for the given type and VF.
6192   const auto &TTICapture = TTI;
6193   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6194     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6195       return 0U;
6196     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6197   };
6198 
6199   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6200     Instruction *I = IdxToInstr[i];
6201 
6202     // Remove all of the instructions that end at this location.
6203     InstrList &List = TransposeEnds[i];
6204     for (Instruction *ToRemove : List)
6205       OpenIntervals.erase(ToRemove);
6206 
6207     // Ignore instructions that are never used within the loop.
6208     if (!Ends.count(I))
6209       continue;
6210 
6211     // Skip ignored values.
6212     if (ValuesToIgnore.count(I))
6213       continue;
6214 
6215     // For each VF find the maximum usage of registers.
6216     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6217       // Count the number of live intervals.
6218       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6219 
6220       if (VFs[j].isScalar()) {
6221         for (auto Inst : OpenIntervals) {
6222           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6223           if (RegUsage.find(ClassID) == RegUsage.end())
6224             RegUsage[ClassID] = 1;
6225           else
6226             RegUsage[ClassID] += 1;
6227         }
6228       } else {
6229         collectUniformsAndScalars(VFs[j]);
6230         for (auto Inst : OpenIntervals) {
6231           // Skip ignored values for VF > 1.
6232           if (VecValuesToIgnore.count(Inst))
6233             continue;
6234           if (isScalarAfterVectorization(Inst, VFs[j])) {
6235             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6236             if (RegUsage.find(ClassID) == RegUsage.end())
6237               RegUsage[ClassID] = 1;
6238             else
6239               RegUsage[ClassID] += 1;
6240           } else {
6241             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6242             if (RegUsage.find(ClassID) == RegUsage.end())
6243               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6244             else
6245               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6246           }
6247         }
6248       }
6249 
6250       for (auto& pair : RegUsage) {
6251         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
6253         else
6254           MaxUsages[j][pair.first] = pair.second;
6255       }
6256     }
6257 
6258     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6259                       << OpenIntervals.size() << '\n');
6260 
6261     // Add the current instruction to the list of open intervals.
6262     OpenIntervals.insert(I);
6263   }
6264 
6265   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6266     SmallMapVector<unsigned, unsigned, 4> Invariant;
6267 
6268     for (auto Inst : LoopInvariants) {
6269       unsigned Usage =
6270           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6271       unsigned ClassID =
6272           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6273       if (Invariant.find(ClassID) == Invariant.end())
6274         Invariant[ClassID] = Usage;
6275       else
6276         Invariant[ClassID] += Usage;
6277     }
6278 
6279     LLVM_DEBUG({
6280       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6281       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6282              << " item\n";
6283       for (const auto &pair : MaxUsages[i]) {
6284         dbgs() << "LV(REG): RegisterClass: "
6285                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6286                << " registers\n";
6287       }
6288       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6289              << " item\n";
6290       for (const auto &pair : Invariant) {
6291         dbgs() << "LV(REG): RegisterClass: "
6292                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6293                << " registers\n";
6294       }
6295     });
6296 
6297     RU.LoopInvariantRegs = Invariant;
6298     RU.MaxLocalUsers = MaxUsages[i];
6299     RUs[i] = RU;
6300   }
6301 
6302   return RUs;
6303 }
6304 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6306   // TODO: Cost model for emulated masked load/store is completely
6307   // broken. This hack guides the cost model to use an artificially
6308   // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked load/gather emulation was previously never allowed.
  // Only a limited amount of masked store/scatter emulation was allowed.
6314   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6315   return isa<LoadInst>(I) ||
6316          (isa<StoreInst>(I) &&
6317           NumPredStores > NumberOfStoresToPredicate);
6318 }
6319 
6320 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6321   // If we aren't vectorizing the loop, or if we've already collected the
6322   // instructions to scalarize, there's nothing to do. Collection may already
6323   // have occurred if we have a user-selected VF and are now computing the
6324   // expected cost for interleaving.
6325   if (VF.isScalar() || VF.isZero() ||
6326       InstsToScalarize.find(VF) != InstsToScalarize.end())
6327     return;
6328 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6330   // not profitable to scalarize any instructions, the presence of VF in the
6331   // map will indicate that we've analyzed it already.
6332   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6333 
6334   // Find all the instructions that are scalar with predication in the loop and
6335   // determine if it would be better to not if-convert the blocks they are in.
6336   // If so, we also record the instructions to scalarize.
6337   for (BasicBlock *BB : TheLoop->blocks()) {
6338     if (!blockNeedsPredication(BB))
6339       continue;
6340     for (Instruction &I : *BB)
6341       if (isScalarWithPredication(&I)) {
6342         ScalarCostsTy ScalarCosts;
        // Do not apply the discount logic if the hacked cost is needed
        // for emulated masked memrefs.
6345         if (!useEmulatedMaskMemRefHack(&I) &&
6346             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6347           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6348         // Remember that BB will remain after vectorization.
6349         PredicatedBBsAfterVectorization.insert(BB);
6350       }
6351   }
6352 }
6353 
6354 int LoopVectorizationCostModel::computePredInstDiscount(
6355     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6356   assert(!isUniformAfterVectorization(PredInst, VF) &&
6357          "Instruction marked uniform-after-vectorization will be predicated");
6358 
6359   // Initialize the discount to zero, meaning that the scalar version and the
6360   // vector version cost the same.
6361   InstructionCost Discount = 0;
6362 
6363   // Holds instructions to analyze. The instructions we visit are mapped in
6364   // ScalarCosts. Those instructions are the ones that would be scalarized if
6365   // we find that the scalar version costs less.
6366   SmallVector<Instruction *, 8> Worklist;
6367 
6368   // Returns true if the given instruction can be scalarized.
6369   auto canBeScalarized = [&](Instruction *I) -> bool {
6370     // We only attempt to scalarize instructions forming a single-use chain
6371     // from the original predicated block that would otherwise be vectorized.
6372     // Although not strictly necessary, we give up on instructions we know will
6373     // already be scalar to avoid traversing chains that are unlikely to be
6374     // beneficial.
6375     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6376         isScalarAfterVectorization(I, VF))
6377       return false;
6378 
6379     // If the instruction is scalar with predication, it will be analyzed
6380     // separately. We ignore it within the context of PredInst.
6381     if (isScalarWithPredication(I))
6382       return false;
6383 
6384     // If any of the instruction's operands are uniform after vectorization,
6385     // the instruction cannot be scalarized. This prevents, for example, a
6386     // masked load from being scalarized.
6387     //
6388     // We assume we will only emit a value for lane zero of an instruction
6389     // marked uniform after vectorization, rather than VF identical values.
6390     // Thus, if we scalarize an instruction that uses a uniform, we would
6391     // create uses of values corresponding to the lanes we aren't emitting code
6392     // for. This behavior can be changed by allowing getScalarValue to clone
6393     // the lane zero values for uniforms rather than asserting.
6394     for (Use &U : I->operands())
6395       if (auto *J = dyn_cast<Instruction>(U.get()))
6396         if (isUniformAfterVectorization(J, VF))
6397           return false;
6398 
6399     // Otherwise, we can scalarize the instruction.
6400     return true;
6401   };
6402 
6403   // Compute the expected cost discount from scalarizing the entire expression
6404   // feeding the predicated instruction. We currently only consider expressions
6405   // that are single-use instruction chains.
6406   Worklist.push_back(PredInst);
6407   while (!Worklist.empty()) {
6408     Instruction *I = Worklist.pop_back_val();
6409 
6410     // If we've already analyzed the instruction, there's nothing to do.
6411     if (ScalarCosts.find(I) != ScalarCosts.end())
6412       continue;
6413 
6414     // Compute the cost of the vector instruction. Note that this cost already
6415     // includes the scalarization overhead of the predicated instruction.
6416     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6417 
6418     // Compute the cost of the scalarized instruction. This cost is the cost of
6419     // the instruction as if it wasn't if-converted and instead remained in the
6420     // predicated block. We will scale this cost by block probability after
6421     // computing the scalarization overhead.
6422     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6423     InstructionCost ScalarCost =
6424         VF.getKnownMinValue() *
6425         getInstructionCost(I, ElementCount::getFixed(1)).first;
6426 
6427     // Compute the scalarization overhead of needed insertelement instructions
6428     // and phi nodes.
6429     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6430       ScalarCost += TTI.getScalarizationOverhead(
6431           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6432           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6433       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6434       ScalarCost +=
6435           VF.getKnownMinValue() *
6436           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6437     }
6438 
6439     // Compute the scalarization overhead of needed extractelement
6440     // instructions. For each of the instruction's operands, if the operand can
6441     // be scalarized, add it to the worklist; otherwise, account for the
6442     // overhead.
6443     for (Use &U : I->operands())
6444       if (auto *J = dyn_cast<Instruction>(U.get())) {
6445         assert(VectorType::isValidElementType(J->getType()) &&
6446                "Instruction has non-scalar type");
6447         if (canBeScalarized(J))
6448           Worklist.push_back(J);
6449         else if (needsExtract(J, VF)) {
6450           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6451           ScalarCost += TTI.getScalarizationOverhead(
6452               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6453               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6454         }
6455       }
6456 
6457     // Scale the total scalar cost by block probability.
6458     ScalarCost /= getReciprocalPredBlockProb();
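    // For example (hypothetical numbers), a scalarized cost of 10 for a block
    // executing under a predicate becomes 10 / getReciprocalPredBlockProb()
    // before it is compared against the vector cost below.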
6459 
6460     // Compute the discount. A non-negative discount means the vector version
6461     // of the instruction costs more, and scalarizing would be beneficial.
6462     Discount += VectorCost - ScalarCost;
6463     ScalarCosts[I] = ScalarCost;
6464   }
6465 
6466   return *Discount.getValue();
6467 }
6468 
6469 LoopVectorizationCostModel::VectorizationCostTy
6470 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6471   VectorizationCostTy Cost;
6472 
6473   // For each block.
6474   for (BasicBlock *BB : TheLoop->blocks()) {
6475     VectorizationCostTy BlockCost;
6476 
6477     // For each instruction in the old loop.
6478     for (Instruction &I : BB->instructionsWithoutDebug()) {
6479       // Skip ignored values.
6480       if (ValuesToIgnore.count(&I) ||
6481           (VF.isVector() && VecValuesToIgnore.count(&I)))
6482         continue;
6483 
6484       VectorizationCostTy C = getInstructionCost(&I, VF);
6485 
6486       // Check if we should override the cost.
6487       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6488         C.first = InstructionCost(ForceTargetInstructionCost);
6489 
6490       BlockCost.first += C.first;
6491       BlockCost.second |= C.second;
6492       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6493                         << " for VF " << VF << " For instruction: " << I
6494                         << '\n');
6495     }
6496 
6497     // If we are vectorizing a predicated block, it will have been
6498     // if-converted. This means that the block's instructions (aside from
6499     // stores and instructions that may divide by zero) will now be
6500     // unconditionally executed. For the scalar case, we may not always execute
6501     // the predicated block, if it is an if-else block. Thus, scale the block's
6502     // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as not to include all blocks in tail-folded loops.
6504     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6505       BlockCost.first /= getReciprocalPredBlockProb();
6506 
6507     Cost.first += BlockCost.first;
6508     Cost.second |= BlockCost.second;
6509   }
6510 
6511   return Cost;
6512 }
6513 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
6516 ///
6517 /// This SCEV can be sent to the Target in order to estimate the address
6518 /// calculation cost.
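///
/// For example (illustrative IR), a pointer such as
///   getelementptr inbounds i32, i32* %base, i64 %iv
/// where %base is loop invariant and %iv is an induction variable is
/// accepted, while a GEP with some other loop-varying index is rejected.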
6519 static const SCEV *getAddressAccessSCEV(
6520               Value *Ptr,
6521               LoopVectorizationLegality *Legal,
6522               PredicatedScalarEvolution &PSE,
6523               const Loop *TheLoop) {
6524 
6525   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6526   if (!Gep)
6527     return nullptr;
6528 
6529   // We are looking for a gep with all loop invariant indices except for one
6530   // which should be an induction variable.
6531   auto SE = PSE.getSE();
6532   unsigned NumOperands = Gep->getNumOperands();
6533   for (unsigned i = 1; i < NumOperands; ++i) {
6534     Value *Opd = Gep->getOperand(i);
6535     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6536         !Legal->isInductionVariable(Opd))
6537       return nullptr;
6538   }
6539 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6541   return PSE.getSCEV(Ptr);
6542 }
6543 
6544 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6545   return Legal->hasStride(I->getOperand(0)) ||
6546          Legal->hasStride(I->getOperand(1));
6547 }
6548 
6549 InstructionCost
6550 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6551                                                         ElementCount VF) {
6552   assert(VF.isVector() &&
6553          "Scalarization cost of instruction implies vectorization.");
6554   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6555   Type *ValTy = getMemInstValueType(I);
6556   auto SE = PSE.getSE();
6557 
6558   unsigned AS = getLoadStoreAddressSpace(I);
6559   Value *Ptr = getLoadStorePointerOperand(I);
6560   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6561 
6562   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6564   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6565 
6566   // Get the cost of the scalar memory instruction and address computation.
6567   InstructionCost Cost =
6568       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6569 
6570   // Don't pass *I here, since it is scalar but will actually be part of a
6571   // vectorized loop where the user of it is a vectorized instruction.
6572   const Align Alignment = getLoadStoreAlignment(I);
6573   Cost += VF.getKnownMinValue() *
6574           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6575                               AS, TTI::TCK_RecipThroughput);
6576 
6577   // Get the overhead of the extractelement and insertelement instructions
6578   // we might create due to scalarization.
6579   Cost += getScalarizationOverhead(I, VF);
6580 
6581   // If we have a predicated store, it may not be executed for each vector
6582   // lane. Scale the cost by the probability of executing the predicated
6583   // block.
6584   if (isPredicatedInst(I)) {
6585     Cost /= getReciprocalPredBlockProb();
6586 
6587     if (useEmulatedMaskMemRefHack(I))
6588       // Artificially setting to a high enough value to practically disable
6589       // vectorization with such operations.
6590       Cost = 3000000;
6591   }
6592 
6593   return Cost;
6594 }
6595 
6596 InstructionCost
6597 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6598                                                     ElementCount VF) {
6599   Type *ValTy = getMemInstValueType(I);
6600   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6601   Value *Ptr = getLoadStorePointerOperand(I);
6602   unsigned AS = getLoadStoreAddressSpace(I);
6603   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6604   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6605 
6606   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6607          "Stride should be 1 or -1 for consecutive memory access");
6608   const Align Alignment = getLoadStoreAlignment(I);
6609   InstructionCost Cost = 0;
6610   if (Legal->isMaskRequired(I))
6611     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6612                                       CostKind);
6613   else
6614     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6615                                 CostKind, I);
6616 
6617   bool Reverse = ConsecutiveStride < 0;
6618   if (Reverse)
6619     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6620   return Cost;
6621 }
6622 
6623 InstructionCost
6624 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6625                                                 ElementCount VF) {
6626   assert(Legal->isUniformMemOp(*I));
6627 
6628   Type *ValTy = getMemInstValueType(I);
6629   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6630   const Align Alignment = getLoadStoreAlignment(I);
6631   unsigned AS = getLoadStoreAddressSpace(I);
6632   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6633   if (isa<LoadInst>(I)) {
6634     return TTI.getAddressComputationCost(ValTy) +
6635            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6636                                CostKind) +
6637            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6638   }
6639   StoreInst *SI = cast<StoreInst>(I);
6640 
6641   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6642   return TTI.getAddressComputationCost(ValTy) +
6643          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6644                              CostKind) +
6645          (isLoopInvariantStoreValue
6646               ? 0
6647               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6648                                        VF.getKnownMinValue() - 1));
6649 }
6650 
6651 InstructionCost
6652 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6653                                                  ElementCount VF) {
6654   Type *ValTy = getMemInstValueType(I);
6655   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6656   const Align Alignment = getLoadStoreAlignment(I);
6657   const Value *Ptr = getLoadStorePointerOperand(I);
6658 
6659   return TTI.getAddressComputationCost(VectorTy) +
6660          TTI.getGatherScatterOpCost(
6661              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6662              TargetTransformInfo::TCK_RecipThroughput, I);
6663 }
6664 
6665 InstructionCost
6666 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6667                                                    ElementCount VF) {
6668   // TODO: Once we have support for interleaving with scalable vectors
6669   // we can calculate the cost properly here.
6670   if (VF.isScalable())
6671     return InstructionCost::getInvalid();
6672 
6673   Type *ValTy = getMemInstValueType(I);
6674   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6675   unsigned AS = getLoadStoreAddressSpace(I);
6676 
6677   auto Group = getInterleavedAccessGroup(I);
6678   assert(Group && "Fail to get an interleaved access group.");
6679 
6680   unsigned InterleaveFactor = Group->getFactor();
6681   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
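  // For instance (illustrative numbers), a load group with factor 3 at a
  // fixed VF of 4 is costed as one wide access of 3 * 4 = 12 elements of the
  // member type.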
6682 
6683   // Holds the indices of existing members in an interleaved load group.
6684   // An interleaved store group doesn't need this as it doesn't allow gaps.
6685   SmallVector<unsigned, 4> Indices;
6686   if (isa<LoadInst>(I)) {
6687     for (unsigned i = 0; i < InterleaveFactor; i++)
6688       if (Group->getMember(i))
6689         Indices.push_back(i);
6690   }
6691 
6692   // Calculate the cost of the whole interleaved group.
6693   bool UseMaskForGaps =
6694       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6695   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6696       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6697       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6698 
6699   if (Group->isReverse()) {
6700     // TODO: Add support for reversed masked interleaved access.
6701     assert(!Legal->isMaskRequired(I) &&
6702            "Reverse masked interleaved access not supported.");
6703     Cost += Group->getNumMembers() *
6704             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6705   }
6706   return Cost;
6707 }
6708 
6709 InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
6710     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  // Early exit for no in-loop reductions
6712   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6713     return InstructionCost::getInvalid();
6714   auto *VectorTy = cast<VectorType>(Ty);
6715 
  // We are looking for one of the following patterns, and finding the minimal
  // acceptable cost for it:
6717   //  reduce(mul(ext(A), ext(B))) or
6718   //  reduce(mul(A, B)) or
6719   //  reduce(ext(A)) or
6720   //  reduce(A).
6721   // The basic idea is that we walk down the tree to do that, finding the root
6722   // reduction instruction in InLoopReductionImmediateChains. From there we find
6723   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the pattern.
  // If it is not, we return an invalid cost specifying that the original cost
  // method should be used.
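  // A sketch of the scalar-loop shape being matched for
  // reduce(mul(ext(A), ext(B))) (names invented for illustration):
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %phi, %mul
  // Walking down from the extends or the mul reaches the add feeding the
  // reduction phi, which becomes RetI below.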
6728   Instruction *RetI = I;
6729   if ((RetI->getOpcode() == Instruction::SExt ||
6730        RetI->getOpcode() == Instruction::ZExt)) {
6731     if (!RetI->hasOneUser())
6732       return InstructionCost::getInvalid();
6733     RetI = RetI->user_back();
6734   }
6735   if (RetI->getOpcode() == Instruction::Mul &&
6736       RetI->user_back()->getOpcode() == Instruction::Add) {
6737     if (!RetI->hasOneUser())
6738       return InstructionCost::getInvalid();
6739     RetI = RetI->user_back();
6740   }
6741 
  // Test if the found instruction is a reduction, and if not, return an
  // invalid cost specifying that the parent should use the original cost
  // modelling.
6744   if (!InLoopReductionImmediateChains.count(RetI))
6745     return InstructionCost::getInvalid();
6746 
6747   // Find the reduction this chain is a part of and calculate the basic cost of
6748   // the reduction on its own.
6749   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6750   Instruction *ReductionPhi = LastChain;
6751   while (!isa<PHINode>(ReductionPhi))
6752     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6753 
6754   RecurrenceDescriptor RdxDesc =
6755       Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
6756   unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(),
6757                                                      VectorTy, false, CostKind);
6758 
6759   // Get the operand that was not the reduction chain and match it to one of the
6760   // patterns, returning the better cost if it is found.
6761   Instruction *RedOp = RetI->getOperand(1) == LastChain
6762                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6763                            : dyn_cast<Instruction>(RetI->getOperand(1));
6764 
6765   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6766 
6767   if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) &&
6768       !TheLoop->isLoopInvariant(RedOp)) {
6769     bool IsUnsigned = isa<ZExtInst>(RedOp);
6770     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6771     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6772         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6773         CostKind);
6774 
6775     unsigned ExtCost =
6776         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6777                              TTI::CastContextHint::None, CostKind, RedOp);
6778     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6779       return I == RetI ? *RedCost.getValue() : 0;
6780   } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) {
6781     Instruction *Mul = RedOp;
6782     Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0));
6783     Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1));
6784     if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) &&
6785         Op0->getOpcode() == Op1->getOpcode() &&
6786         Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6787         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6788       bool IsUnsigned = isa<ZExtInst>(Op0);
6789       auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6790       // reduce(mul(ext, ext))
6791       unsigned ExtCost =
6792           TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
6793                                TTI::CastContextHint::None, CostKind, Op0);
6794       unsigned MulCost =
6795           TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
6796 
6797       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6798           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6799           CostKind);
6800 
6801       if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
6802         return I == RetI ? *RedCost.getValue() : 0;
6803     } else {
6804       unsigned MulCost =
6805           TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
6806 
6807       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6808           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6809           CostKind);
6810 
6811       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6812         return I == RetI ? *RedCost.getValue() : 0;
6813     }
6814   }
6815 
6816   return I == RetI ? BaseCost : InstructionCost::getInvalid();
6817 }
6818 
6819 InstructionCost
6820 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6821                                                      ElementCount VF) {
6822   // Calculate scalar cost only. Vectorization cost should be ready at this
6823   // moment.
6824   if (VF.isScalar()) {
6825     Type *ValTy = getMemInstValueType(I);
6826     const Align Alignment = getLoadStoreAlignment(I);
6827     unsigned AS = getLoadStoreAddressSpace(I);
6828 
6829     return TTI.getAddressComputationCost(ValTy) +
6830            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6831                                TTI::TCK_RecipThroughput, I);
6832   }
6833   return getWideningCost(I, VF);
6834 }
6835 
6836 LoopVectorizationCostModel::VectorizationCostTy
6837 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6838                                                ElementCount VF) {
6839   // If we know that this instruction will remain uniform, check the cost of
6840   // the scalar version.
6841   if (isUniformAfterVectorization(I, VF))
6842     VF = ElementCount::getFixed(1);
6843 
6844   if (VF.isVector() && isProfitableToScalarize(I, VF))
6845     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6846 
6847   // Forced scalars do not have any scalarization overhead.
6848   auto ForcedScalar = ForcedScalars.find(VF);
6849   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6850     auto InstSet = ForcedScalar->second;
6851     if (InstSet.count(I))
6852       return VectorizationCostTy(
6853           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6854            VF.getKnownMinValue()),
6855           false);
6856   }
6857 
6858   Type *VectorTy;
6859   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6860 
6861   bool TypeNotScalarized =
6862       VF.isVector() && VectorTy->isVectorTy() &&
6863       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6864   return VectorizationCostTy(C, TypeNotScalarized);
6865 }
6866 
6867 InstructionCost
6868 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6869                                                      ElementCount VF) {
6870 
6871   if (VF.isScalable())
6872     return InstructionCost::getInvalid();
6873 
6874   if (VF.isScalar())
6875     return 0;
6876 
6877   InstructionCost Cost = 0;
6878   Type *RetTy = ToVectorTy(I->getType(), VF);
6879   if (!RetTy->isVoidTy() &&
6880       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6881     Cost += TTI.getScalarizationOverhead(
6882         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6883         true, false);
6884 
6885   // Some targets keep addresses scalar.
6886   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6887     return Cost;
6888 
6889   // Some targets support efficient element stores.
6890   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6891     return Cost;
6892 
6893   // Collect operands to consider.
6894   CallInst *CI = dyn_cast<CallInst>(I);
6895   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6896 
6897   // Skip operands that do not require extraction/scalarization and do not incur
6898   // any overhead.
6899   return Cost + TTI.getOperandsScalarizationOverhead(
6900                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6901 }
6902 
6903 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6904   if (VF.isScalar())
6905     return;
6906   NumPredStores = 0;
6907   for (BasicBlock *BB : TheLoop->blocks()) {
6908     // For each instruction in the old loop.
6909     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6911       if (!Ptr)
6912         continue;
6913 
6914       // TODO: We should generate better code and update the cost model for
6915       // predicated uniform stores. Today they are treated as any other
6916       // predicated store (see added test cases in
6917       // invariant-store-vectorization.ll).
6918       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6919         NumPredStores++;
6920 
6921       if (Legal->isUniformMemOp(I)) {
6922         // TODO: Avoid replicating loads and stores instead of
6923         // relying on instcombine to remove them.
6924         // Load: Scalar load + broadcast
6925         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6926         InstructionCost Cost = getUniformMemOpCost(&I, VF);
6927         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6928         continue;
6929       }
6930 
6931       // We assume that widening is the best solution when possible.
6932       if (memoryInstructionCanBeWidened(&I, VF)) {
6933         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6934         int ConsecutiveStride =
6935                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6936         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6937                "Expected consecutive stride.");
6938         InstWidening Decision =
6939             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6940         setWideningDecision(&I, VF, Decision, Cost);
6941         continue;
6942       }
6943 
6944       // Choose between Interleaving, Gather/Scatter or Scalarization.
6945       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6946       unsigned NumAccesses = 1;
6947       if (isAccessInterleaved(&I)) {
6948         auto Group = getInterleavedAccessGroup(&I);
6949         assert(Group && "Fail to get an interleaved access group.");
6950 
6951         // Make one decision for the whole group.
6952         if (getWideningDecision(&I, VF) != CM_Unknown)
6953           continue;
6954 
6955         NumAccesses = Group->getNumMembers();
6956         if (interleavedAccessCanBeWidened(&I, VF))
6957           InterleaveCost = getInterleaveGroupCost(&I, VF);
6958       }
6959 
6960       InstructionCost GatherScatterCost =
6961           isLegalGatherOrScatter(&I)
6962               ? getGatherScatterCost(&I, VF) * NumAccesses
6963               : InstructionCost::getInvalid();
6964 
6965       InstructionCost ScalarizationCost =
6966           !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses
6967                            : InstructionCost::getInvalid();
6968 
      // Choose the better solution for the current VF, record this decision,
      // and use it during vectorization.
6971       InstructionCost Cost;
6972       InstWidening Decision;
6973       if (InterleaveCost <= GatherScatterCost &&
6974           InterleaveCost < ScalarizationCost) {
6975         Decision = CM_Interleave;
6976         Cost = InterleaveCost;
6977       } else if (GatherScatterCost < ScalarizationCost) {
6978         Decision = CM_GatherScatter;
6979         Cost = GatherScatterCost;
6980       } else {
6981         assert(!VF.isScalable() &&
6982                "We cannot yet scalarise for scalable vectors");
6983         Decision = CM_Scalarize;
6984         Cost = ScalarizationCost;
6985       }
      // If the instruction belongs to an interleave group, the whole group
6987       // receives the same decision. The whole group receives the cost, but
6988       // the cost will actually be assigned to one instruction.
6989       if (auto Group = getInterleavedAccessGroup(&I))
6990         setWideningDecision(Group, VF, Decision, Cost);
6991       else
6992         setWideningDecision(&I, VF, Decision, Cost);
6993     }
6994   }
6995 
  // Make sure that any load of an address and any other address computation
6997   // remains scalar unless there is gather/scatter support. This avoids
6998   // inevitable extracts into address registers, and also has the benefit of
6999   // activating LSR more, since that pass can't optimize vectorized
7000   // addresses.
7001   if (TTI.prefersVectorizedAddressing())
7002     return;
7003 
7004   // Start with all scalar pointer uses.
7005   SmallPtrSet<Instruction *, 8> AddrDefs;
7006   for (BasicBlock *BB : TheLoop->blocks())
7007     for (Instruction &I : *BB) {
7008       Instruction *PtrDef =
7009         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7010       if (PtrDef && TheLoop->contains(PtrDef) &&
7011           getWideningDecision(&I, VF) != CM_GatherScatter)
7012         AddrDefs.insert(PtrDef);
7013     }
7014 
7015   // Add all instructions used to generate the addresses.
7016   SmallVector<Instruction *, 4> Worklist;
7017   append_range(Worklist, AddrDefs);
7018   while (!Worklist.empty()) {
7019     Instruction *I = Worklist.pop_back_val();
7020     for (auto &Op : I->operands())
7021       if (auto *InstOp = dyn_cast<Instruction>(Op))
7022         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7023             AddrDefs.insert(InstOp).second)
7024           Worklist.push_back(InstOp);
7025   }
7026 
7027   for (auto *I : AddrDefs) {
7028     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
7031       // if the loaded register is involved in an address computation, it is
7032       // instead changed here when we know this is the case.
7033       InstWidening Decision = getWideningDecision(I, VF);
7034       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7035         // Scalarize a widened load of address.
7036         setWideningDecision(
7037             I, VF, CM_Scalarize,
7038             (VF.getKnownMinValue() *
7039              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7040       else if (auto Group = getInterleavedAccessGroup(I)) {
7041         // Scalarize an interleave group of address loads.
7042         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7043           if (Instruction *Member = Group->getMember(I))
7044             setWideningDecision(
7045                 Member, VF, CM_Scalarize,
7046                 (VF.getKnownMinValue() *
7047                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7048         }
7049       }
7050     } else
7051       // Make sure I gets scalarized and a cost estimate without
7052       // scalarization overhead.
7053       ForcedScalars[VF].insert(I);
7054   }
7055 }
7056 
7057 InstructionCost
7058 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7059                                                Type *&VectorTy) {
7060   Type *RetTy = I->getType();
7061   if (canTruncateToMinimalBitwidth(I, VF))
7062     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7063   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
7064   auto SE = PSE.getSE();
7065   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7066 
7067   // TODO: We need to estimate the cost of intrinsic calls.
7068   switch (I->getOpcode()) {
7069   case Instruction::GetElementPtr:
7070     // We mark this instruction as zero-cost because the cost of GEPs in
7071     // vectorized code depends on whether the corresponding memory instruction
7072     // is scalarized or not. Therefore, we handle GEPs with the memory
7073     // instruction cost.
7074     return 0;
7075   case Instruction::Br: {
7076     // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
7079     bool ScalarPredicatedBB = false;
7080     BranchInst *BI = cast<BranchInst>(I);
7081     if (VF.isVector() && BI->isConditional() &&
7082         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7083          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7084       ScalarPredicatedBB = true;
7085 
7086     if (ScalarPredicatedBB) {
7087       // Return cost for branches around scalarized and predicated blocks.
7088       assert(!VF.isScalable() && "scalable vectors not yet supported.");
7089       auto *Vec_i1Ty =
7090           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7091       return (TTI.getScalarizationOverhead(
7092                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7093                   false, true) +
7094               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
7095                VF.getKnownMinValue()));
7096     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7097       // The back-edge branch will remain, as will all scalar branches.
7098       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7099     else
7100       // This branch will be eliminated by if-conversion.
7101       return 0;
7102     // Note: We currently assume zero cost for an unconditional branch inside
7103     // a predicated block since it will become a fall-through, although we
7104     // may decide in the future to call TTI for all branches.
7105   }
7106   case Instruction::PHI: {
7107     auto *Phi = cast<PHINode>(I);
7108 
7109     // First-order recurrences are replaced by vector shuffles inside the loop.
7110     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7111     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7112       return TTI.getShuffleCost(
7113           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7114           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7115 
7116     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7117     // converted into select instructions. We require N - 1 selects per phi
7118     // node, where N is the number of incoming values.
7119     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7120       return (Phi->getNumIncomingValues() - 1) *
7121              TTI.getCmpSelInstrCost(
7122                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7123                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7124                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7125 
7126     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7127   }
7128   case Instruction::UDiv:
7129   case Instruction::SDiv:
7130   case Instruction::URem:
7131   case Instruction::SRem:
7132     // If we have a predicated instruction, it may not be executed for each
7133     // vector lane. Get the scalarization cost and scale this amount by the
7134     // probability of executing the predicated block. If the instruction is not
7135     // predicated, we fall through to the next case.
7136     if (VF.isVector() && isScalarWithPredication(I)) {
7137       InstructionCost Cost = 0;
7138 
7139       // These instructions have a non-void type, so account for the phi nodes
7140       // that we will create. This cost is likely to be zero. The phi node
7141       // cost, if any, should be scaled by the block probability because it
7142       // models a copy at the end of each predicated block.
7143       Cost += VF.getKnownMinValue() *
7144               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7145 
7146       // The cost of the non-predicated instruction.
7147       Cost += VF.getKnownMinValue() *
7148               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7149 
7150       // The cost of insertelement and extractelement instructions needed for
7151       // scalarization.
7152       Cost += getScalarizationOverhead(I, VF);
7153 
7154       // Scale the cost by the probability of executing the predicated blocks.
7155       // This assumes the predicated block for each vector lane is equally
7156       // likely.
7157       return Cost / getReciprocalPredBlockProb();
7158     }
7159     LLVM_FALLTHROUGH;
7160   case Instruction::Add:
7161   case Instruction::FAdd:
7162   case Instruction::Sub:
7163   case Instruction::FSub:
7164   case Instruction::Mul:
7165   case Instruction::FMul:
7166   case Instruction::FDiv:
7167   case Instruction::FRem:
7168   case Instruction::Shl:
7169   case Instruction::LShr:
7170   case Instruction::AShr:
7171   case Instruction::And:
7172   case Instruction::Or:
7173   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
7175     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7176       return 0;
7177 
7178     // Detect reduction patterns
7179     InstructionCost RedCost;
7180     if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7181             .isValid())
7182       return RedCost;
7183 
7184     // Certain instructions can be cheaper to vectorize if they have a constant
7185     // second vector operand. One example of this are shifts on x86.
7186     Value *Op2 = I->getOperand(1);
7187     TargetTransformInfo::OperandValueProperties Op2VP;
7188     TargetTransformInfo::OperandValueKind Op2VK =
7189         TTI.getOperandInfo(Op2, Op2VP);
7190     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7191       Op2VK = TargetTransformInfo::OK_UniformValue;
7192 
7193     SmallVector<const Value *, 4> Operands(I->operand_values());
7194     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7195     return N * TTI.getArithmeticInstrCost(
7196                    I->getOpcode(), VectorTy, CostKind,
7197                    TargetTransformInfo::OK_AnyValue,
7198                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7199   }
7200   case Instruction::FNeg: {
7201     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7202     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7203     return N * TTI.getArithmeticInstrCost(
7204                    I->getOpcode(), VectorTy, CostKind,
7205                    TargetTransformInfo::OK_AnyValue,
7206                    TargetTransformInfo::OK_AnyValue,
7207                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7208                    I->getOperand(0), I);
7209   }
7210   case Instruction::Select: {
7211     SelectInst *SI = cast<SelectInst>(I);
7212     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7213     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7214     Type *CondTy = SI->getCondition()->getType();
7215     if (!ScalarCond)
7216       CondTy = VectorType::get(CondTy, VF);
7217     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7218                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7219   }
7220   case Instruction::ICmp:
7221   case Instruction::FCmp: {
7222     Type *ValTy = I->getOperand(0)->getType();
7223     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7224     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7225       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7226     VectorTy = ToVectorTy(ValTy, VF);
7227     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7228                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7229   }
7230   case Instruction::Store:
7231   case Instruction::Load: {
7232     ElementCount Width = VF;
7233     if (Width.isVector()) {
7234       InstWidening Decision = getWideningDecision(I, Width);
7235       assert(Decision != CM_Unknown &&
7236              "CM decision should be taken at this point");
7237       if (Decision == CM_Scalarize)
7238         Width = ElementCount::getFixed(1);
7239     }
7240     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7241     return getMemoryInstructionCost(I, VF);
7242   }
7243   case Instruction::ZExt:
7244   case Instruction::SExt:
7245   case Instruction::FPToUI:
7246   case Instruction::FPToSI:
7247   case Instruction::FPExt:
7248   case Instruction::PtrToInt:
7249   case Instruction::IntToPtr:
7250   case Instruction::SIToFP:
7251   case Instruction::UIToFP:
7252   case Instruction::Trunc:
7253   case Instruction::FPTrunc:
7254   case Instruction::BitCast: {
7255     // Computes the CastContextHint from a Load/Store instruction.
7256     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7257       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7258              "Expected a load or a store!");
7259 
7260       if (VF.isScalar() || !TheLoop->contains(I))
7261         return TTI::CastContextHint::Normal;
7262 
7263       switch (getWideningDecision(I, VF)) {
7264       case LoopVectorizationCostModel::CM_GatherScatter:
7265         return TTI::CastContextHint::GatherScatter;
7266       case LoopVectorizationCostModel::CM_Interleave:
7267         return TTI::CastContextHint::Interleave;
7268       case LoopVectorizationCostModel::CM_Scalarize:
7269       case LoopVectorizationCostModel::CM_Widen:
7270         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7271                                         : TTI::CastContextHint::Normal;
7272       case LoopVectorizationCostModel::CM_Widen_Reverse:
7273         return TTI::CastContextHint::Reversed;
7274       case LoopVectorizationCostModel::CM_Unknown:
7275         llvm_unreachable("Instr did not go through cost modelling?");
7276       }
7277 
7278       llvm_unreachable("Unhandled case!");
7279     };
7280 
7281     unsigned Opcode = I->getOpcode();
7282     TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, the context is the only user, which must be a StoreInst.
7284     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7285       if (I->hasOneUse())
7286         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7287           CCH = ComputeCCH(Store);
7288     }
    // For ZExt/SExt/FPExt, the context is the operand, which must be a LoadInst.
7290     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7291              Opcode == Instruction::FPExt) {
7292       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7293         CCH = ComputeCCH(Load);
7294     }
7295 
7296     // We optimize the truncation of induction variables having constant
7297     // integer steps. The cost of these truncations is the same as the scalar
7298     // operation.
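    // For example, a 'trunc i64 %iv to i32' of an induction with a constant
    // step can be generated directly as a narrower i32 induction, so only the
    // cost of a scalar trunc is charged.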
7299     if (isOptimizableIVTruncate(I, VF)) {
7300       auto *Trunc = cast<TruncInst>(I);
7301       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7302                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7303     }
7304 
7305     // Detect reduction patterns
7306     InstructionCost RedCost;
7307     if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7308             .isValid())
7309       return RedCost;
7310 
7311     Type *SrcScalarTy = I->getOperand(0)->getType();
7312     Type *SrcVecTy =
7313         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7314     if (canTruncateToMinimalBitwidth(I, VF)) {
7315       // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
7317       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7318       //
7319       // Calculate the modified src and dest types.
7320       Type *MinVecTy = VectorTy;
7321       if (Opcode == Instruction::Trunc) {
7322         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7323         VectorTy =
7324             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7325       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7326         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7327         VectorTy =
7328             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7329       }
7330     }
7331 
7332     unsigned N;
7333     if (isScalarAfterVectorization(I, VF)) {
7334       assert(!VF.isScalable() && "VF is assumed to be non scalable");
7335       N = VF.getKnownMinValue();
7336     } else
7337       N = 1;
7338     return N *
7339            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7340   }
7341   case Instruction::Call: {
7342     bool NeedToScalarize;
7343     CallInst *CI = cast<CallInst>(I);
7344     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7345     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7346       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7347       return std::min(CallCost, IntrinsicCost);
7348     }
7349     return CallCost;
7350   }
7351   case Instruction::ExtractValue:
7352     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7353   default:
7354     // The cost of executing VF copies of the scalar instruction. This opcode
7355     // is unknown. Assume that it is the same as 'mul'.
7356     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7357                                        Instruction::Mul, VectorTy, CostKind) +
7358            getScalarizationOverhead(I, VF);
7359   } // end of switch.
7360 }
7361 
7362 char LoopVectorize::ID = 0;
7363 
7364 static const char lv_name[] = "Loop Vectorization";
7365 
7366 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7367 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7368 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7369 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7370 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7371 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7372 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7373 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7374 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7375 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7376 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7377 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7378 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7379 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7380 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7381 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7382 
7383 namespace llvm {
7384 
7385 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7386 
7387 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7388                               bool VectorizeOnlyWhenForced) {
7389   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7390 }
7391 
7392 } // end namespace llvm
7393 
7394 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7395   // Check if the pointer operand of a load or store instruction is
7396   // consecutive.
7397   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7398     return Legal->isConsecutivePtr(Ptr);
7399   return false;
7400 }
7401 
7402 void LoopVectorizationCostModel::collectValuesToIgnore() {
7403   // Ignore ephemeral values.
7404   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7405 
7406   // Ignore type-promoting instructions we identified during reduction
7407   // detection.
7408   for (auto &Reduction : Legal->getReductionVars()) {
7409     RecurrenceDescriptor &RedDes = Reduction.second;
7410     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7411     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7412   }
7413   // Ignore type-casting instructions we identified during induction
7414   // detection.
7415   for (auto &Induction : Legal->getInductionVars()) {
7416     InductionDescriptor &IndDes = Induction.second;
7417     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7418     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7419   }
7420 }
7421 
7422 void LoopVectorizationCostModel::collectInLoopReductions() {
7423   for (auto &Reduction : Legal->getReductionVars()) {
7424     PHINode *Phi = Reduction.first;
7425     RecurrenceDescriptor &RdxDesc = Reduction.second;
7426 
7427     // We don't collect reductions that are type promoted (yet).
7428     if (RdxDesc.getRecurrenceType() != Phi->getType())
7429       continue;
7430 
7431     // If the target would prefer this reduction to happen "in-loop", then we
7432     // want to record it as such.
7433     unsigned Opcode = RdxDesc.getOpcode();
7434     if (!PreferInLoopReductions &&
7435         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7436                                    TargetTransformInfo::ReductionFlags()))
7437       continue;
7438 
7439     // Check that we can correctly put the reductions into the loop, by
7440     // finding the chain of operations that leads from the phi to the loop
7441     // exit value.
7442     SmallVector<Instruction *, 4> ReductionOperations =
7443         RdxDesc.getReductionOpChain(Phi, TheLoop);
7444     bool InLoop = !ReductionOperations.empty();
7445     if (InLoop) {
7446       InLoopReductionChains[Phi] = ReductionOperations;
7447       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7448       Instruction *LastChain = Phi;
7449       for (auto *I : ReductionOperations) {
7450         InLoopReductionImmediateChains[I] = LastChain;
7451         LastChain = I;
7452       }
7453     }
7454     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7455                       << " reduction for phi: " << *Phi << "\n");
7456   }
7457 }
7458 
7459 // TODO: we could return a pair of values that specify the max VF and
7460 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7462 // doesn't have a cost model that can choose which plan to execute if
7463 // more than one is generated.
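// For example, 256-bit wide vector registers combined with a widest scalar
// type of 32 bits yield a VF of 8.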
7464 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7465                                  LoopVectorizationCostModel &CM) {
7466   unsigned WidestType;
7467   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7468   return WidestVectorRegBits / WidestType;
7469 }
7470 
7471 VectorizationFactor
7472 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7473   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7474   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is profitable.
7477   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7478   // the vectorization pipeline.
7479   if (!OrigLoop->isInnermost()) {
7480     // If the user doesn't provide a vectorization factor, determine a
7481     // reasonable one.
7482     if (UserVF.isZero()) {
7483       VF = ElementCount::getFixed(
7484           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7485       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7486 
7487       // Make sure we have a VF > 1 for stress testing.
7488       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7489         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7490                           << "overriding computed VF.\n");
7491         VF = ElementCount::getFixed(4);
7492       }
7493     }
7494     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7495     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7496            "VF needs to be a power of two");
7497     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7498                       << "VF " << VF << " to build VPlans.\n");
7499     buildVPlans(VF, VF);
7500 
7501     // For VPlan build stress testing, we bail out after VPlan construction.
7502     if (VPlanBuildStressTest)
7503       return VectorizationFactor::Disabled();
7504 
7505     return {VF, 0 /*Cost*/};
7506   }
7507 
7508   LLVM_DEBUG(
7509       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7510                 "VPlan-native path.\n");
7511   return VectorizationFactor::Disabled();
7512 }
7513 
7514 Optional<VectorizationFactor>
7515 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7516   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7517   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
7519     return None;
7520 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
7522   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7523       !useMaskedInterleavedAccesses(*TTI)) {
7524     LLVM_DEBUG(
7525         dbgs()
7526         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7527            "which requires masked-interleaved support.\n");
7528     if (CM.InterleaveInfo.invalidateGroups())
7529       // Invalidating interleave groups also requires invalidating all decisions
7530       // based on them, which includes widening decisions and uniform and scalar
7531       // values.
7532       CM.invalidateCostModelingDecisions();
7533   }
7534 
7535   ElementCount MaxVF = MaybeMaxVF.getValue();
7536   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7537 
7538   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
7539   if (!UserVF.isZero() &&
7540       (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
    // FIXME: MaxVF is temporarily used in place of UserVF for illegal scalable
    // VFs here; this should be reverted to only use legal UserVFs once the
7543     // loop below supports scalable VFs.
7544     ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
7545     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
7546                       << " VF " << VF << ".\n");
7547     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7548            "VF needs to be a power of two");
7549     // Collect the instructions (and their associated costs) that will be more
7550     // profitable to scalarize.
7551     CM.selectUserVectorizationFactor(VF);
7552     CM.collectInLoopReductions();
7553     buildVPlansWithVPRecipes(VF, VF);
7554     LLVM_DEBUG(printPlans(dbgs()));
7555     return {{VF, 0}};
7556   }
7557 
7558   assert(!MaxVF.isScalable() &&
7559          "Scalable vectors not yet supported beyond this point");
7560 
7561   for (ElementCount VF = ElementCount::getFixed(1);
7562        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7563     // Collect Uniform and Scalar instructions after vectorization with VF.
7564     CM.collectUniformsAndScalars(VF);
7565 
7566     // Collect the instructions (and their associated costs) that will be more
7567     // profitable to scalarize.
7568     if (VF.isVector())
7569       CM.collectInstsToScalarize(VF);
7570   }
7571 
7572   CM.collectInLoopReductions();
7573 
7574   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7575   LLVM_DEBUG(printPlans(dbgs()));
7576   if (MaxVF.isScalar())
7577     return VectorizationFactor::Disabled();
7578 
7579   // Select the optimal vectorization factor.
7580   return CM.selectVectorizationFactor(MaxVF);
7581 }
7582 
7583 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7584   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7585                     << '\n');
7586   BestVF = VF;
7587   BestUF = UF;
7588 
7589   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7590     return !Plan->hasVF(VF);
7591   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7593 }
7594 
7595 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7596                                            DominatorTree *DT) {
7597   // Perform the actual loop transformation.
7598 
7599   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7600   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7601   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7602 
7603   VPTransformState State{
7604       *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
7605   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7606   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7607   State.CanonicalIV = ILV.Induction;
7608 
7609   ILV.printDebugTracesAtStart();
7610 
7611   //===------------------------------------------------===//
7612   //
  // Notice: any optimization or new instruction that goes
7614   // into the code below should also be implemented in
7615   // the cost-model.
7616   //
7617   //===------------------------------------------------===//
7618 
7619   // 2. Copy and widen instructions from the old loop into the new loop.
7620   VPlans.front()->execute(&State);
7621 
7622   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7623   //    predication, updating analyses.
7624   ILV.fixVectorizedLoop(State);
7625 
7626   ILV.printDebugTracesAtEnd();
7627 }
7628 
7629 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7630     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7631 
7632   // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
7635   SmallVector<BasicBlock*> ExitingBlocks;
7636   OrigLoop->getExitingBlocks(ExitingBlocks);
7637   for (auto *BB : ExitingBlocks) {
7638     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7639     if (!Cmp || !Cmp->hasOneUse())
7640       continue;
7641 
7642     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7643     if (!DeadInstructions.insert(Cmp).second)
7644       continue;
7645 
    // An operand of the icmp is often a dead trunc, used by IndUpdate.
7647     // TODO: can recurse through operands in general
7648     for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
7651     }
7652   }
7653 
7654   // We create new "steps" for induction variable updates to which the original
7655   // induction variables map. An original update instruction will be dead if
7656   // all its users except the induction variable are dead.
7657   auto *Latch = OrigLoop->getLoopLatch();
7658   for (auto &Induction : Legal->getInductionVars()) {
7659     PHINode *Ind = Induction.first;
7660     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7661 
7662     // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
7664     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7665       continue;
7666 
7667     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7668           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7669         }))
7670       DeadInstructions.insert(IndUpdate);
7671 
7672     // We record as "Dead" also the type-casting instructions we had identified
7673     // during induction analysis. We don't need any handling for them in the
7674     // vectorized loop because we have proven that, under a proper runtime
7675     // test guarding the vectorized loop, the value of the phi, and the casted
7676     // value of the phi, are the same. The last instruction in this casting chain
7677     // will get its scalar/vector/widened def from the scalar/vector/widened def
7678     // of the respective phi node. Any other casts in the induction def-use chain
7679     // have no other uses outside the phi update chain, and will be ignored.
7680     InductionDescriptor &IndDes = Induction.second;
7681     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7682     DeadInstructions.insert(Casts.begin(), Casts.end());
7683   }
7684 }
7685 
7686 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7687 
7688 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7689 
7690 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7691                                         Instruction::BinaryOps BinOp) {
7692   // When unrolling and the VF is 1, we only need to add a simple scalar.
7693   Type *Ty = Val->getType();
7694   assert(!Ty->isVectorTy() && "Val must be a scalar");
7695 
7696   if (Ty->isFloatingPointTy()) {
7697     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7698 
7699     // Floating point operations had to be 'fast' to enable the unrolling.
7700     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7701     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7702   }
7703   Constant *C = ConstantInt::get(Ty, StartIdx);
7704   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7705 }
7706 
7707 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7708   SmallVector<Metadata *, 4> MDs;
7709   // Reserve first location for self reference to the LoopID metadata node.
7710   MDs.push_back(nullptr);
7711   bool IsUnrollMetadata = false;
7712   MDNode *LoopID = L->getLoopID();
7713   if (LoopID) {
7714     // First find existing loop unrolling disable metadata.
7715     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7716       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7717       if (MD) {
7718         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7719         IsUnrollMetadata =
7720             S && S->getString().startswith("llvm.loop.unroll.disable");
7721       }
7722       MDs.push_back(LoopID->getOperand(i));
7723     }
7724   }
7725 
7726   if (!IsUnrollMetadata) {
7727     // Add runtime unroll disable metadata.
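    // The resulting loop ID has roughly the following form, where !1 is the
    // node appended below:
    //   !0 = distinct !{!0, <existing operands>, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}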
7728     LLVMContext &Context = L->getHeader()->getContext();
7729     SmallVector<Metadata *, 1> DisableOperands;
7730     DisableOperands.push_back(
7731         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7732     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7733     MDs.push_back(DisableNode);
7734     MDNode *NewLoopID = MDNode::get(Context, MDs);
7735     // Set operand 0 to refer to the loop id itself.
7736     NewLoopID->replaceOperandWith(0, NewLoopID);
7737     L->setLoopID(NewLoopID);
7738   }
7739 }
7740 
7741 //===--------------------------------------------------------------------===//
7742 // EpilogueVectorizerMainLoop
7743 //===--------------------------------------------------------------------===//
7744 
7745 /// This function is partially responsible for generating the control flow
7746 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7747 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7748   MDNode *OrigLoopID = OrigLoop->getLoopID();
7749   Loop *Lp = createVectorLoopSkeleton("");
7750 
7751   // Generate the code to check the minimum iteration count of the vector
7752   // epilogue (see below).
7753   EPI.EpilogueIterationCountCheck =
7754       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7755   EPI.EpilogueIterationCountCheck->setName("iter.check");
7756 
7757   // Generate the code to check any assumptions that we've made for SCEV
7758   // expressions.
7759   BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7760   emitSCEVChecks(Lp, LoopScalarPreHeader);
7761 
  // If a safety check was generated, save it.
7763   if (SavedPreHeader != LoopVectorPreHeader)
7764     EPI.SCEVSafetyCheck = SavedPreHeader;
7765 
7766   // Generate the code that checks at runtime if arrays overlap. We put the
7767   // checks into a separate block to make the more common case of few elements
7768   // faster.
7769   SavedPreHeader = LoopVectorPreHeader;
7770   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7771 
  // If a safety check was generated, save/overwrite it.
7773   if (SavedPreHeader != LoopVectorPreHeader)
7774     EPI.MemSafetyCheck = SavedPreHeader;
7775 
7776   // Generate the iteration count check for the main loop, *after* the check
7777   // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
7780   // trip count. Note: the branch will get updated later on when we vectorize
7781   // the epilogue.
7782   EPI.MainLoopIterationCountCheck =
7783       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7784 
7785   // Generate the induction variable.
7786   OldInduction = Legal->getPrimaryInduction();
7787   Type *IdxTy = Legal->getWidestInductionType();
7788   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7789   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7790   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7791   EPI.VectorTripCount = CountRoundDown;
7792   Induction =
7793       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7794                               getDebugLocFromInstOrOperands(OldInduction));
7795 
  // Skip creating induction resume values here because they will be created in
  // the second pass. If we created them here, they wouldn't be used anyway,
  // because the VPlan in the second pass still contains the inductions from the
7799   // original loop.
7800 
7801   return completeLoopSkeleton(Lp, OrigLoopID);
7802 }
7803 
7804 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7805   LLVM_DEBUG({
7806     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7807            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7808            << ", Main Loop UF:" << EPI.MainLoopUF
7809            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7810            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7811   });
7812 }
7813 
7814 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7815   DEBUG_WITH_TYPE(VerboseDebug, {
7816     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7817   });
7818 }
7819 
7820 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7821     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7822   assert(L && "Expected valid Loop.");
7823   assert(Bypass && "Expected valid bypass basic block.");
7824   unsigned VFactor =
7825       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7826   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7827   Value *Count = getOrCreateTripCount(L);
7828   // Reuse existing vector loop preheader for TC checks.
7829   // Note that new preheader block is generated for vector loop.
7830   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7831   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7832 
  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector loop being checked (the epilogue loop when ForEpilogue is true,
  // the main vector loop otherwise).
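  // For example, with VFactor = 4 and UFactor = 2, the branch to the bypass
  // block is taken when the trip count is below 8 (or at most 8 when a scalar
  // epilogue is required).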
7835   auto P =
7836       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7837 
7838   Value *CheckMinIters = Builder.CreateICmp(
7839       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
7840       "min.iters.check");
7841 
7842   if (!ForEpilogue)
7843     TCCheckBlock->setName("vector.main.loop.iter.check");
7844 
7845   // Create new preheader for vector loop.
7846   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7847                                    DT, LI, nullptr, "vector.ph");
7848 
7849   if (ForEpilogue) {
7850     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7851                                  DT->getNode(Bypass)->getIDom()) &&
7852            "TC check is expected to dominate Bypass");
7853 
7854     // Update dominator for Bypass & LoopExit.
7855     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7856     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7857 
7858     LoopBypassBlocks.push_back(TCCheckBlock);
7859 
7860     // Save the trip count so we don't have to regenerate it in the
7861     // vec.epilog.iter.check. This is safe to do because the trip count
7862     // generated here dominates the vector epilog iter check.
7863     EPI.TripCount = Count;
7864   }
7865 
7866   ReplaceInstWithInst(
7867       TCCheckBlock->getTerminator(),
7868       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7869 
7870   return TCCheckBlock;
7871 }
7872 
7873 //===--------------------------------------------------------------------===//
7874 // EpilogueVectorizerEpilogueLoop
7875 //===--------------------------------------------------------------------===//
7876 
7877 /// This function is partially responsible for generating the control flow
7878 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7879 BasicBlock *
7880 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7881   MDNode *OrigLoopID = OrigLoop->getLoopID();
7882   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7883 
  // Now, compare the remaining count and, if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
7886   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7887   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7888   LoopVectorPreHeader =
7889       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7890                  LI, nullptr, "vec.epilog.ph");
7891   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7892                                           VecEpilogueIterationCountCheck);
7893 
7894   // Adjust the control flow taking the state info from the main loop
7895   // vectorization into account.
7896   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7897          "expected this to be saved from the previous pass.");
7898   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7899       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7900 
7901   DT->changeImmediateDominator(LoopVectorPreHeader,
7902                                EPI.MainLoopIterationCountCheck);
7903 
7904   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7905       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7906 
7907   if (EPI.SCEVSafetyCheck)
7908     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7909         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7910   if (EPI.MemSafetyCheck)
7911     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7912         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7913 
7914   DT->changeImmediateDominator(
7915       VecEpilogueIterationCountCheck,
7916       VecEpilogueIterationCountCheck->getSinglePredecessor());
7917 
7918   DT->changeImmediateDominator(LoopScalarPreHeader,
7919                                EPI.EpilogueIterationCountCheck);
7920   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
7921 
7922   // Keep track of bypass blocks, as they feed start values to the induction
7923   // phis in the scalar loop preheader.
7924   if (EPI.SCEVSafetyCheck)
7925     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7926   if (EPI.MemSafetyCheck)
7927     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7928   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7929 
7930   // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
7932   Type *IdxTy = Legal->getWidestInductionType();
7933   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7934                                          LoopVectorPreHeader->getFirstNonPHI());
7935   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7936   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7937                            EPI.MainLoopIterationCountCheck);
7938 
7939   // Generate the induction variable.
7940   OldInduction = Legal->getPrimaryInduction();
7941   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7942   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7943   Value *StartIdx = EPResumeVal;
7944   Induction =
7945       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7946                               getDebugLocFromInstOrOperands(OldInduction));
7947 
7948   // Generate induction resume values. These variables save the new starting
7949   // indexes for the scalar loop. They are used to test if there are any tail
7950   // iterations left once the vector loop has completed.
7951   // Note that when the vectorized epilogue is skipped due to iteration count
7952   // check, then the resume value for the induction variable comes from
7953   // the trip count of the main vector loop, hence passing the AdditionalBypass
7954   // argument.
7955   createInductionResumeValues(Lp, CountRoundDown,
7956                               {VecEpilogueIterationCountCheck,
7957                                EPI.VectorTripCount} /* AdditionalBypass */);
7958 
7959   AddRuntimeUnrollDisableMetaData(Lp);
7960   return completeLoopSkeleton(Lp, OrigLoopID);
7961 }
7962 
7963 BasicBlock *
7964 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7965     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7966 
7967   assert(EPI.TripCount &&
7968          "Expected trip count to have been safed in the first pass.");
7969   assert(
7970       (!isa<Instruction>(EPI.TripCount) ||
7971        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7972       "saved trip count does not dominate insertion point.");
7973   Value *TC = EPI.TripCount;
7974   IRBuilder<> Builder(Insert->getTerminator());
7975   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7976 
7977   // Generate code to check if the loop's trip count is less than VF * UF of the
7978   // vector epilogue loop.
7979   auto P =
7980       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7981 
7982   Value *CheckMinIters = Builder.CreateICmp(
7983       P, Count,
7984       ConstantInt::get(Count->getType(),
7985                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7986       "min.epilog.iters.check");
7987 
7988   ReplaceInstWithInst(
7989       Insert->getTerminator(),
7990       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7991 
7992   LoopBypassBlocks.push_back(Insert);
7993   return Insert;
7994 }
7995 
7996 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7997   LLVM_DEBUG({
7998     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7999            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
8000            << ", Main Loop UF:" << EPI.MainLoopUF
8001            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8002            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8003   });
8004 }
8005 
8006 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8007   DEBUG_WITH_TYPE(VerboseDebug, {
8008     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
8009   });
8010 }
8011 
8012 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8013     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8014   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8015   bool PredicateAtRangeStart = Predicate(Range.Start);
8016 
8017   for (ElementCount TmpVF = Range.Start * 2;
8018        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8019     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8020       Range.End = TmpVF;
8021       break;
8022     }
8023 
8024   return PredicateAtRangeStart;
8025 }
8026 
/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VFs starting at a given VF and extending it as much as possible. Each
8030 /// vectorization decision can potentially shorten this sub-range during
8031 /// buildVPlan().
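/// For example, with MinVF = 2 and MaxVF = 16, and a decision that changes at
/// VF = 8, this builds one VPlan covering {2, 4} and a second one covering
/// {8, 16}.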
8032 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8033                                            ElementCount MaxVF) {
8034   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8035   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8036     VFRange SubRange = {VF, MaxVFPlusOne};
8037     VPlans.push_back(buildVPlan(SubRange));
8038     VF = SubRange.End;
8039   }
8040 }
8041 
8042 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8043                                          VPlanPtr &Plan) {
8044   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8045 
8046   // Look for cached value.
8047   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8048   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8049   if (ECEntryIt != EdgeMaskCache.end())
8050     return ECEntryIt->second;
8051 
8052   VPValue *SrcMask = createBlockInMask(Src, Plan);
8053 
8054   // The terminator has to be a branch inst!
8055   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8056   assert(BI && "Unexpected terminator found");
8057 
8058   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8059     return EdgeMaskCache[Edge] = SrcMask;
8060 
8061   // If source is an exiting block, we know the exit edge is dynamically dead
8062   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8063   // adding uses of an otherwise potentially dead instruction.
8064   if (OrigLoop->isLoopExiting(Src))
8065     return EdgeMaskCache[Edge] = SrcMask;
8066 
8067   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8068   assert(EdgeMask && "No Edge Mask found for condition");
8069 
8070   if (BI->getSuccessor(0) != Dst)
8071     EdgeMask = Builder.createNot(EdgeMask);
8072 
8073   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8074     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8075     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8076     // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here would propagate that poison.
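    // For example, 'select i1 %src, i1 %edge, i1 false' evaluates to false
    // when %src is false even if %edge is poison, whereas 'and i1 %src, %edge'
    // would be poison in that case.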
8078     VPValue *False = Plan->getOrAddVPValue(
8079         ConstantInt::getFalse(BI->getCondition()->getType()));
8080     EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8081   }
8082 
8083   return EdgeMaskCache[Edge] = EdgeMask;
8084 }
8085 
8086 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8087   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8088 
8089   // Look for cached value.
8090   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8091   if (BCEntryIt != BlockMaskCache.end())
8092     return BCEntryIt->second;
8093 
8094   // All-one mask is modelled as no-mask following the convention for masked
8095   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8096   VPValue *BlockMask = nullptr;
8097 
8098   if (OrigLoop->getHeader() == BB) {
8099     if (!CM.blockNeedsPredication(BB))
8100       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8101 
8102     // Create the block in mask as the first non-phi instruction in the block.
8103     VPBuilder::InsertPointGuard Guard(Builder);
8104     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8105     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8106 
8107     // Introduce the early-exit compare IV <= BTC to form header block mask.
8108     // This is used instead of IV < TC because TC may wrap, unlike BTC.
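    // For example, with an i8 IV and a trip count of 256, TC wraps around to 0
    // while BTC == 255 is still representable.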
8109     // Start by constructing the desired canonical IV.
8110     VPValue *IV = nullptr;
8111     if (Legal->getPrimaryInduction())
8112       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8113     else {
8114       auto IVRecipe = new VPWidenCanonicalIVRecipe();
8115       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8116       IV = IVRecipe->getVPValue();
8117     }
8118     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8119     bool TailFolded = !CM.isScalarEpilogueAllowed();
8120 
8121     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop trip count
      // as a second argument, we only pass the IV here and extract the trip
      // count from the transform state, where codegen of the VP instructions
      // happens.
8126       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8127     } else {
8128       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8129     }
8130     return BlockMaskCache[BB] = BlockMask;
8131   }
8132 
8133   // This is the block mask. We OR all incoming edges.
8134   for (auto *Predecessor : predecessors(BB)) {
8135     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8136     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8137       return BlockMaskCache[BB] = EdgeMask;
8138 
8139     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8140       BlockMask = EdgeMask;
8141       continue;
8142     }
8143 
8144     BlockMask = Builder.createOr(BlockMask, EdgeMask);
8145   }
8146 
8147   return BlockMaskCache[BB] = BlockMask;
8148 }
8149 
8150 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
8151                                                 VPlanPtr &Plan) {
8152   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8153          "Must be called with either a load or store");
8154 
8155   auto willWiden = [&](ElementCount VF) -> bool {
8156     if (VF.isScalar())
8157       return false;
8158     LoopVectorizationCostModel::InstWidening Decision =
8159         CM.getWideningDecision(I, VF);
8160     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8161            "CM decision should be taken at this point.");
8162     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8163       return true;
8164     if (CM.isScalarAfterVectorization(I, VF) ||
8165         CM.isProfitableToScalarize(I, VF))
8166       return false;
8167     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8168   };
8169 
8170   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8171     return nullptr;
8172 
8173   VPValue *Mask = nullptr;
8174   if (Legal->isMaskRequired(I))
8175     Mask = createBlockInMask(I->getParent(), Plan);
8176 
8177   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
8178   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8179     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
8180 
8181   StoreInst *Store = cast<StoreInst>(I);
8182   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
8183   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
8184 }
8185 
8186 VPWidenIntOrFpInductionRecipe *
8187 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const {
8188   // Check if this is an integer or fp induction. If so, build the recipe that
8189   // produces its scalar and vector values.
8190   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8191   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8192       II.getKind() == InductionDescriptor::IK_FpInduction) {
8193     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8194     const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
8195     return new VPWidenIntOrFpInductionRecipe(
8196         Phi, Start, Casts.empty() ? nullptr : Casts.front());
8197   }
8198 
8199   return nullptr;
8200 }
8201 
8202 VPWidenIntOrFpInductionRecipe *
8203 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
8204                                                 VPlan &Plan) const {
8205   // Optimize the special case where the source is a constant integer
8206   // induction variable. Notice that we can only optimize the 'trunc' case
8207   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8208   // (c) other casts depend on pointer size.
8209 
8210   // Determine whether \p K is a truncation based on an induction variable that
8211   // can be optimized.
8212   auto isOptimizableIVTruncate =
8213       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8214     return [=](ElementCount VF) -> bool {
8215       return CM.isOptimizableIVTruncate(K, VF);
8216     };
8217   };
8218 
8219   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8220           isOptimizableIVTruncate(I), Range)) {
8221 
8222     InductionDescriptor II =
8223         Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
8224     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8225     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8226                                              Start, nullptr, I);
8227   }
8228   return nullptr;
8229 }
8230 
8231 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8232   // We know that all PHIs in non-header blocks are converted into selects, so
8233   // we don't have to worry about the insertion order and we can just use the
8234   // builder. At this point we generate the predication tree. There may be
8235   // duplications since this is a simple recursive scan, but future
8236   // optimizations will clean it up.
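  // For example, a phi with incoming values %a and %b yields the operand list
  // (%a, mask-of-%a's-edge, %b, mask-of-%b's-edge), built below; the blend
  // recipe is later lowered to a chain of selects.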
8237 
8238   SmallVector<VPValue *, 2> Operands;
8239   unsigned NumIncoming = Phi->getNumIncomingValues();
8240   for (unsigned In = 0; In < NumIncoming; In++) {
8241     VPValue *EdgeMask =
8242       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8243     assert((EdgeMask || NumIncoming == 1) &&
8244            "Multiple predecessors with one having a full mask");
8245     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8246     if (EdgeMask)
8247       Operands.push_back(EdgeMask);
8248   }
8249   return new VPBlendRecipe(Phi, Operands);
8250 }
8251 
8252 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8253                                                    VPlan &Plan) const {
8254 
8255   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8256       [this, CI](ElementCount VF) {
8257         return CM.isScalarWithPredication(CI, VF);
8258       },
8259       Range);
8260 
8261   if (IsPredicated)
8262     return nullptr;
8263 
8264   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8265   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8266              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8267              ID == Intrinsic::pseudoprobe ||
8268              ID == Intrinsic::experimental_noalias_scope_decl))
8269     return nullptr;
8270 
8271   auto willWiden = [&](ElementCount VF) -> bool {
8272     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8273     // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a usual call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform the intrinsic call compared to the lib call?
8277     bool NeedToScalarize = false;
8278     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8279     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8280     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8281     assert(IntrinsicCost.isValid() && CallCost.isValid() &&
8282            "Cannot have invalid costs while widening");
8283     return UseVectorIntrinsic || !NeedToScalarize;
8284   };
8285 
8286   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8287     return nullptr;
8288 
8289   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8290 }
8291 
8292 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8293   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8294          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8295   // Instruction should be widened, unless it is scalar after vectorization,
8296   // scalarization is profitable or it is predicated.
8297   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8298     return CM.isScalarAfterVectorization(I, VF) ||
8299            CM.isProfitableToScalarize(I, VF) ||
8300            CM.isScalarWithPredication(I, VF);
8301   };
8302   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8303                                                              Range);
8304 }
8305 
8306 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8307   auto IsVectorizableOpcode = [](unsigned Opcode) {
8308     switch (Opcode) {
8309     case Instruction::Add:
8310     case Instruction::And:
8311     case Instruction::AShr:
8312     case Instruction::BitCast:
8313     case Instruction::FAdd:
8314     case Instruction::FCmp:
8315     case Instruction::FDiv:
8316     case Instruction::FMul:
8317     case Instruction::FNeg:
8318     case Instruction::FPExt:
8319     case Instruction::FPToSI:
8320     case Instruction::FPToUI:
8321     case Instruction::FPTrunc:
8322     case Instruction::FRem:
8323     case Instruction::FSub:
8324     case Instruction::ICmp:
8325     case Instruction::IntToPtr:
8326     case Instruction::LShr:
8327     case Instruction::Mul:
8328     case Instruction::Or:
8329     case Instruction::PtrToInt:
8330     case Instruction::SDiv:
8331     case Instruction::Select:
8332     case Instruction::SExt:
8333     case Instruction::Shl:
8334     case Instruction::SIToFP:
8335     case Instruction::SRem:
8336     case Instruction::Sub:
8337     case Instruction::Trunc:
8338     case Instruction::UDiv:
8339     case Instruction::UIToFP:
8340     case Instruction::URem:
8341     case Instruction::Xor:
8342     case Instruction::ZExt:
8343       return true;
8344     }
8345     return false;
8346   };
8347 
8348   if (!IsVectorizableOpcode(I->getOpcode()))
8349     return nullptr;
8350 
8351   // Success: widen this instruction.
8352   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8353 }
8354 
8355 VPBasicBlock *VPRecipeBuilder::handleReplication(
8356     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8357     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8358     VPlanPtr &Plan) {
8359   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8360       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8361       Range);
8362 
8363   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8364       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8365       Range);
8366 
8367   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8368                                        IsUniform, IsPredicated);
8369   setRecipe(I, Recipe);
8370   Plan->addVPValue(I, Recipe);
8371 
8372   // Find if I uses a predicated instruction. If so, it will use its scalar
8373   // value. Avoid hoisting the insert-element which packs the scalar value into
8374   // a vector value, as that happens iff all users use the vector value.
8375   for (auto &Op : I->operands())
8376     if (auto *PredInst = dyn_cast<Instruction>(Op))
8377       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8378         PredInst2Recipe[PredInst]->setAlsoPack(false);
8379 
8380   // Finalize the recipe for Instr, first if it is not predicated.
8381   if (!IsPredicated) {
8382     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8383     VPBB->appendRecipe(Recipe);
8384     return VPBB;
8385   }
8386   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8387   assert(VPBB->getSuccessors().empty() &&
8388          "VPBB has successors when handling predicated replication.");
8389   // Record predicated instructions for above packing optimizations.
8390   PredInst2Recipe[I] = Recipe;
8391   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8392   VPBlockUtils::insertBlockAfter(Region, VPBB);
8393   auto *RegSucc = new VPBasicBlock();
8394   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8395   return RegSucc;
8396 }
8397 
8398 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8399                                                       VPRecipeBase *PredRecipe,
8400                                                       VPlanPtr &Plan) {
8401   // Instructions marked for predication are replicated and placed under an
8402   // if-then construct to prevent side-effects.
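  // The region built below has the shape:
  //   pred.<opcode>.entry (branch on the block mask)
  //     -> pred.<opcode>.if (the replicated instruction)
  //     -> pred.<opcode>.continue (phi merging the predicated value, if any)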
8403 
8404   // Generate recipes to compute the block mask for this region.
8405   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8406 
8407   // Build the triangular if-then region.
8408   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8409   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8410   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8411   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8412   auto *PHIRecipe = Instr->getType()->isVoidTy()
8413                         ? nullptr
8414                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8415   if (PHIRecipe) {
8416     Plan->removeVPValueFor(Instr);
8417     Plan->addVPValue(Instr, PHIRecipe);
8418   }
8419   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8420   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8421   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8422 
8423   // Note: first set Entry as region entry and then connect successors starting
8424   // from it in order, to propagate the "parent" of each VPBasicBlock.
8425   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8426   VPBlockUtils::connectBlocks(Pred, Exit);
8427 
8428   return Region;
8429 }
8430 
8431 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8432                                                       VFRange &Range,
8433                                                       VPlanPtr &Plan) {
8434   // First, check for specific widening recipes that deal with calls, memory
8435   // operations, inductions and Phi nodes.
8436   if (auto *CI = dyn_cast<CallInst>(Instr))
8437     return tryToWidenCall(CI, Range, *Plan);
8438 
8439   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8440     return tryToWidenMemory(Instr, Range, Plan);
8441 
8442   VPRecipeBase *Recipe;
8443   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8444     if (Phi->getParent() != OrigLoop->getHeader())
8445       return tryToBlend(Phi, Plan);
8446     if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
8447       return Recipe;
8448 
8449     if (Legal->isReductionVariable(Phi)) {
8450       RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8451       VPValue *StartV =
8452           Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
8453       return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
8454     }
8455 
8456     return new VPWidenPHIRecipe(Phi);
8457   }
8458 
8459   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8460                                     cast<TruncInst>(Instr), Range, *Plan)))
8461     return Recipe;
8462 
8463   if (!shouldWiden(Instr, Range))
8464     return nullptr;
8465 
8466   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8467     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8468                                 OrigLoop);
8469 
8470   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8471     bool InvariantCond =
8472         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8473     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8474                                    InvariantCond);
8475   }
8476 
8477   return tryToWiden(Instr, *Plan);
8478 }
8479 
8480 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8481                                                         ElementCount MaxVF) {
8482   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8483 
8484   // Collect instructions from the original loop that will become trivially dead
8485   // in the vectorized loop. We don't need to vectorize these instructions. For
8486   // example, original induction update instructions can become dead because we
8487   // separately emit induction "steps" when generating code for the new loop.
8488   // Similarly, we create a new latch condition when setting up the structure
8489   // of the new loop, so the old one can become dead.
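  // For example, in a simple counted loop the scalar induction update
  //   %iv.next = add nuw nsw i64 %iv, 1
  // typically becomes dead, since vector induction steps are emitted
  // separately for the new loop.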
8490   SmallPtrSet<Instruction *, 4> DeadInstructions;
8491   collectTriviallyDeadInstructions(DeadInstructions);
8492 
8493   // Add assume instructions we need to drop to DeadInstructions, to prevent
8494   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
8497   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8498   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8499 
8500   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8501   // Dead instructions do not need sinking. Remove them from SinkAfter.
8502   for (Instruction *I : DeadInstructions)
8503     SinkAfter.erase(I);
8504 
8505   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8506   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8507     VFRange SubRange = {VF, MaxVFPlusOne};
8508     VPlans.push_back(
8509         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8510     VF = SubRange.End;
8511   }
8512 }
8513 
8514 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8515     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8516     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8517 
8518   // Hold a mapping from predicated instructions to their recipes, in order to
8519   // fix their AlsoPack behavior if a user is determined to replicate and use a
8520   // scalar instead of vector value.
8521   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8522 
8523   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8524 
8525   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8526 
8527   // ---------------------------------------------------------------------------
8528   // Pre-construction: record ingredients whose recipes we'll need to further
8529   // process after constructing the initial VPlan.
8530   // ---------------------------------------------------------------------------
8531 
8532   // Mark instructions we'll need to sink later and their targets as
8533   // ingredients whose recipe we'll need to record.
8534   for (auto &Entry : SinkAfter) {
8535     RecipeBuilder.recordRecipeOf(Entry.first);
8536     RecipeBuilder.recordRecipeOf(Entry.second);
8537   }
8538   for (auto &Reduction : CM.getInLoopReductionChains()) {
8539     PHINode *Phi = Reduction.first;
8540     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8541     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8542 
8543     RecipeBuilder.recordRecipeOf(Phi);
8544     for (auto &R : ReductionOperations) {
8545       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
8548       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8549         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8550     }
8551   }
8552 
8553   // For each interleave group which is relevant for this (possibly trimmed)
8554   // Range, add it to the set of groups to be later applied to the VPlan and add
8555   // placeholders for its members' Recipes which we'll be replacing with a
8556   // single VPInterleaveRecipe.
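  // For illustration, an interleave group of factor 2 covering loads of
  // A[2*i] and A[2*i+1] ends up with one VPInterleaveRecipe at its insert
  // position, later emitting a single wide load plus shuffles instead of two
  // separately widened loads.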
8557   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8558     auto applyIG = [IG, this](ElementCount VF) -> bool {
8559       return (VF.isVector() && // Query is illegal for VF == 1
8560               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8561                   LoopVectorizationCostModel::CM_Interleave);
8562     };
8563     if (!getDecisionAndClampRange(applyIG, Range))
8564       continue;
8565     InterleaveGroups.insert(IG);
8566     for (unsigned i = 0; i < IG->getFactor(); i++)
8567       if (Instruction *Member = IG->getMember(i))
8568         RecipeBuilder.recordRecipeOf(Member);
8569   };
8570 
8571   // ---------------------------------------------------------------------------
8572   // Build initial VPlan: Scan the body of the loop in a topological order to
8573   // visit each basic block after having visited its predecessor basic blocks.
8574   // ---------------------------------------------------------------------------
8575 
8576   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8577   auto Plan = std::make_unique<VPlan>();
8578   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8579   Plan->setEntry(VPBB);
8580 
8581   // Scan the body of the loop in a topological order to visit each basic block
8582   // after having visited its predecessor basic blocks.
8583   LoopBlocksDFS DFS(OrigLoop);
8584   DFS.perform(LI);
8585 
8586   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8587     // Relevant instructions from basic block BB will be grouped into VPRecipe
8588     // ingredients and fill a new VPBasicBlock.
8589     unsigned VPBBsForBB = 0;
8590     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8591     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8592     VPBB = FirstVPBBForBB;
8593     Builder.setInsertPoint(VPBB);
8594 
8595     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
8597     for (Instruction &I : BB->instructionsWithoutDebug()) {
8598       Instruction *Instr = &I;
8599 
8600       // First filter out irrelevant instructions, to ensure no recipes are
8601       // built for them.
8602       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8603         continue;
8604 
8605       if (auto Recipe =
8606               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8607 
8608         // VPBlendRecipes with a single incoming (value, mask) pair are no-ops.
8609         // Use the incoming value directly.
8610         if (isa<VPBlendRecipe>(Recipe) && Recipe->getNumOperands() <= 2) {
8611           Plan->removeVPValueFor(Instr);
8612           Plan->addVPValue(Instr, Recipe->getOperand(0));
8613           delete Recipe;
8614           continue;
8615         }
8616         for (auto *Def : Recipe->definedValues()) {
8617           auto *UV = Def->getUnderlyingValue();
8618           Plan->addVPValue(UV, Def);
8619         }
8620 
8621         RecipeBuilder.setRecipe(Instr, Recipe);
8622         VPBB->appendRecipe(Recipe);
8623         continue;
8624       }
8625 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
8628       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8629           Instr, Range, VPBB, PredInst2Recipe, Plan);
8630       if (NextVPBB != VPBB) {
8631         VPBB = NextVPBB;
8632         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8633                                     : "");
8634       }
8635     }
8636   }
8637 
  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic-blocks with no recipes.
8641   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8642   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8643   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8644   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8645   delete PreEntry;
8646 
8647   // ---------------------------------------------------------------------------
8648   // Transform initial VPlan: Apply previously taken decisions, in order, to
8649   // bring the VPlan to its final state.
8650   // ---------------------------------------------------------------------------
8651 
8652   // Apply Sink-After legal constraints.
8653   for (auto &Entry : SinkAfter) {
8654     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8655     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8656     // If the target is in a replication region, make sure to move Sink to the
8657     // block after it, not into the replication region itself.
8658     if (auto *Region =
8659             dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) {
8660       if (Region->isReplicator()) {
8661         assert(Region->getNumSuccessors() == 1 && "Expected SESE region!");
8662         VPBasicBlock *NextBlock =
8663             cast<VPBasicBlock>(Region->getSuccessors().front());
8664         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8665         continue;
8666       }
8667     }
8668     Sink->moveAfter(Target);
8669   }
8670 
8671   // Interleave memory: for each Interleave Group we marked earlier as relevant
8672   // for this VPlan, replace the Recipes widening its memory instructions with a
8673   // single VPInterleaveRecipe at its insertion point.
8674   for (auto IG : InterleaveGroups) {
8675     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8676         RecipeBuilder.getRecipe(IG->getInsertPos()));
8677     SmallVector<VPValue *, 4> StoredValues;
8678     for (unsigned i = 0; i < IG->getFactor(); ++i)
8679       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8680         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8681 
8682     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8683                                         Recipe->getMask());
8684     VPIG->insertBefore(Recipe);
8685     unsigned J = 0;
8686     for (unsigned i = 0; i < IG->getFactor(); ++i)
8687       if (Instruction *Member = IG->getMember(i)) {
8688         if (!Member->getType()->isVoidTy()) {
8689           VPValue *OriginalV = Plan->getVPValue(Member);
8690           Plan->removeVPValueFor(Member);
8691           Plan->addVPValue(Member, VPIG->getVPValue(J));
8692           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8693           J++;
8694         }
8695         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8696       }
8697   }
8698 
8699   // Adjust the recipes for any inloop reductions.
8700   if (Range.Start.isVector())
8701     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8702 
8703   // Finally, if tail is folded by masking, introduce selects between the phi
8704   // and the live-out instruction of each reduction, at the end of the latch.
8705   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8706     Builder.setInsertPoint(VPBB);
8707     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8708     for (auto &Reduction : Legal->getReductionVars()) {
8709       if (CM.isInLoopReduction(Reduction.first))
8710         continue;
8711       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8712       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8713       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8714     }
8715   }
8716 
8717   std::string PlanName;
8718   raw_string_ostream RSO(PlanName);
8719   ElementCount VF = Range.Start;
8720   Plan->addVF(VF);
8721   RSO << "Initial VPlan for VF={" << VF;
8722   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8723     Plan->addVF(VF);
8724     RSO << "," << VF;
8725   }
8726   RSO << "},UF>=1";
8727   RSO.flush();
8728   Plan->setName(PlanName);
8729 
8730   return Plan;
8731 }
8732 
8733 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8734   // Outer loop handling: They may require CFG and instruction level
8735   // transformations before even evaluating whether vectorization is profitable.
8736   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8737   // the vectorization pipeline.
8738   assert(!OrigLoop->isInnermost());
8739   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8740 
8741   // Create new empty VPlan
8742   auto Plan = std::make_unique<VPlan>();
8743 
8744   // Build hierarchical CFG
8745   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8746   HCFGBuilder.buildHierarchicalCFG();
8747 
8748   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8749        VF *= 2)
8750     Plan->addVF(VF);
8751 
8752   if (EnableVPlanPredication) {
8753     VPlanPredicator VPP(*Plan);
8754     VPP.predicate();
8755 
8756     // Avoid running transformation to recipes until masked code generation in
8757     // VPlan-native path is in place.
8758     return Plan;
8759   }
8760 
8761   SmallPtrSet<Instruction *, 1> DeadInstructions;
8762   VPlanTransforms::VPInstructionsToVPRecipes(
8763       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8764   return Plan;
8765 }
8766 
// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
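// For illustration, an in-loop integer add reduction such as
//   %red = phi i32 [ 0, %preheader ], [ %add, %latch ]
//   %add = add i32 %red, %val
// has its widened add replaced by a VPReductionRecipe that reduces the vector
// of %val values and accumulates the scalar result into the chain (%red).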
8771 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8772     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8773   for (auto &Reduction : CM.getInLoopReductionChains()) {
8774     PHINode *Phi = Reduction.first;
8775     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8776     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8777 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For min/max recurrences the chain will be the select instructions.
8782     Instruction *Chain = Phi;
8783     for (Instruction *R : ReductionOperations) {
8784       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8785       RecurKind Kind = RdxDesc.getRecurrenceKind();
8786 
8787       VPValue *ChainOp = Plan->getVPValue(Chain);
8788       unsigned FirstOpId;
8789       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8790         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8791                "Expected to replace a VPWidenSelectSC");
8792         FirstOpId = 1;
8793       } else {
8794         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8795                "Expected to replace a VPWidenSC");
8796         FirstOpId = 0;
8797       }
8798       unsigned VecOpId =
8799           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8800       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8801 
8802       auto *CondOp = CM.foldTailByMasking()
8803                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8804                          : nullptr;
8805       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8806           &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
8807       WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
8808       Plan->removeVPValueFor(R);
8809       Plan->addVPValue(R, RedRecipe);
8810       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8812       WidenRecipe->eraseFromParent();
8813 
8814       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8815         VPRecipeBase *CompareRecipe =
8816             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8817         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8818                "Expected to replace a VPWidenSC");
8819         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
8820                "Expected no remaining users");
8821         CompareRecipe->eraseFromParent();
8822       }
8823       Chain = R;
8824     }
8825   }
8826 }
8827 
8828 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8829                                VPSlotTracker &SlotTracker) const {
8830   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8831   IG->getInsertPos()->printAsOperand(O, false);
8832   O << ", ";
8833   getAddr()->printAsOperand(O, SlotTracker);
8834   VPValue *Mask = getMask();
8835   if (Mask) {
8836     O << ", ";
8837     Mask->printAsOperand(O, SlotTracker);
8838   }
8839   for (unsigned i = 0; i < IG->getFactor(); ++i)
8840     if (Instruction *I = IG->getMember(i))
8841       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8842 }
8843 
8844 void VPWidenCallRecipe::execute(VPTransformState &State) {
8845   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8846                                   *this, State);
8847 }
8848 
8849 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8850   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8851                                     this, *this, InvariantCond, State);
8852 }
8853 
8854 void VPWidenRecipe::execute(VPTransformState &State) {
8855   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8856 }
8857 
8858 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8859   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8860                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8861                       IsIndexLoopInvariant, State);
8862 }
8863 
8864 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8865   assert(!State.Instance && "Int or FP induction being replicated.");
8866   State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
8867                                    getTruncInst(), getVPValue(0),
8868                                    getCastValue(), State);
8869 }
8870 
8871 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8872   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc,
8873                                  getStartValue(), this, State);
8874 }
8875 
8876 void VPBlendRecipe::execute(VPTransformState &State) {
8877   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8878   // We know that all PHIs in non-header blocks are converted into
8879   // selects, so we don't have to worry about the insertion order and we
8880   // can just use the builder.
8881   // At this point we generate the predication tree. There may be
8882   // duplications since this is a simple recursive scan, but future
8883   // optimizations will clean it up.
8884 
8885   unsigned NumIncoming = getNumIncomingValues();
8886 
8887   // Generate a sequence of selects of the form:
8888   // SELECT(Mask3, In3,
8889   //        SELECT(Mask2, In2,
8890   //               SELECT(Mask1, In1,
8891   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // are essentially undef and take their value from In0.
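  // For VF = 4 and three incoming values this roughly materializes:
  //   %s1 = select <4 x i1> %mask1, <4 x i32> %in1, <4 x i32> %in0
  //   %s2 = select <4 x i1> %mask2, <4 x i32> %in2, <4 x i32> %s1
  // (names are illustrative; the generated selects are called "predphi").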
8894   InnerLoopVectorizer::VectorParts Entry(State.UF);
8895   for (unsigned In = 0; In < NumIncoming; ++In) {
8896     for (unsigned Part = 0; Part < State.UF; ++Part) {
8897       // We might have single edge PHIs (blocks) - use an identity
8898       // 'select' for the first PHI operand.
8899       Value *In0 = State.get(getIncomingValue(In), Part);
8900       if (In == 0)
8901         Entry[Part] = In0; // Initialize with the first incoming value.
8902       else {
8903         // Select between the current value and the previous incoming edge
8904         // based on the incoming mask.
8905         Value *Cond = State.get(getMask(In), Part);
8906         Entry[Part] =
8907             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8908       }
8909     }
8910   }
8911   for (unsigned Part = 0; Part < State.UF; ++Part)
8912     State.set(this, Entry[Part], Part);
8913 }
8914 
8915 void VPInterleaveRecipe::execute(VPTransformState &State) {
8916   assert(!State.Instance && "Interleave group being replicated.");
8917   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
8918                                       getStoredValues(), getMask());
8919 }
8920 
8921 void VPReductionRecipe::execute(VPTransformState &State) {
8922   assert(!State.Instance && "Reduction being replicated.");
8923   for (unsigned Part = 0; Part < State.UF; ++Part) {
8924     RecurKind Kind = RdxDesc->getRecurrenceKind();
8925     Value *NewVecOp = State.get(getVecOp(), Part);
8926     if (VPValue *Cond = getCondOp()) {
8927       Value *NewCond = State.get(Cond, Part);
8928       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8929       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8930           Kind, VecTy->getElementType());
8931       Constant *IdenVec =
8932           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8933       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8934       NewVecOp = Select;
8935     }
8936     Value *NewRed =
8937         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
8938     Value *PrevInChain = State.get(getChainOp(), Part);
8939     Value *NextInChain;
8940     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8941       NextInChain =
8942           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
8943                          NewRed, PrevInChain);
8944     } else {
8945       NextInChain = State.Builder.CreateBinOp(
8946           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8947           PrevInChain);
8948     }
8949     State.set(this, NextInChain, Part);
8950   }
8951 }
8952 
8953 void VPReplicateRecipe::execute(VPTransformState &State) {
8954   if (State.Instance) { // Generate a single instance.
8955     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
8956     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
8957                                     *State.Instance, IsPredicated, State);
8958     // Insert scalar instance packing it into a vector.
8959     if (AlsoPack && State.VF.isVector()) {
8960       // If we're constructing lane 0, initialize to start from poison.
8961       if (State.Instance->Lane == 0) {
8962         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8963         Value *Poison = PoisonValue::get(
8964             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8965         State.set(this, Poison, State.Instance->Part);
8966       }
8967       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
8968     }
8969     return;
8970   }
8971 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
8975   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8976   assert((!State.VF.isScalable() || IsUniform) &&
8977          "Can't scalarize a scalable vector");
8978   for (unsigned Part = 0; Part < State.UF; ++Part)
8979     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8980       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
8981                                       VPIteration(Part, Lane), IsPredicated,
8982                                       State);
8983 }
8984 
8985 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8986   assert(State.Instance && "Branch on Mask works only on single instance.");
8987 
8988   unsigned Part = State.Instance->Part;
8989   unsigned Lane = State.Instance->Lane;
8990 
8991   Value *ConditionBit = nullptr;
8992   VPValue *BlockInMask = getMask();
8993   if (BlockInMask) {
8994     ConditionBit = State.get(BlockInMask, Part);
8995     if (ConditionBit->getType()->isVectorTy())
8996       ConditionBit = State.Builder.CreateExtractElement(
8997           ConditionBit, State.Builder.getInt32(Lane));
8998   } else // Block in mask is all-one.
8999     ConditionBit = State.Builder.getTrue();
9000 
9001   // Replace the temporary unreachable terminator with a new conditional branch,
9002   // whose two destinations will be set later when they are created.
9003   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9004   assert(isa<UnreachableInst>(CurrentTerminator) &&
9005          "Expected to replace unreachable terminator with conditional branch.");
9006   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9007   CondBr->setSuccessor(0, nullptr);
9008   ReplaceInstWithInst(CurrentTerminator, CondBr);
9009 }
9010 
9011 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9012   assert(State.Instance && "Predicated instruction PHI works per instance.");
9013   Instruction *ScalarPredInst =
9014       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9015   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9016   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9017   assert(PredicatingBB && "Predicated block has no single predecessor.");
9018   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9019          "operand must be VPReplicateRecipe");
9020 
9021   // By current pack/unpack logic we need to generate only a single phi node: if
9022   // a vector value for the predicated instruction exists at this point it means
9023   // the instruction has vector users only, and a phi for the vector value is
9024   // needed. In this case the recipe of the predicated instruction is marked to
9025   // also do that packing, thereby "hoisting" the insert-element sequence.
9026   // Otherwise, a phi node for the scalar value is needed.
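  // For illustration, the vector case generates roughly:
  //   %vphi = phi <4 x i32> [ %vec.before.insert, %predicating.bb ],
  //                         [ %vec.with.new.lane, %predicated.bb ]
  // whereas the scalar case blends poison with the single scalar replica.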
9027   unsigned Part = State.Instance->Part;
9028   if (State.hasVectorValue(getOperand(0), Part)) {
9029     Value *VectorValue = State.get(getOperand(0), Part);
9030     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9031     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9032     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9033     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9034     if (State.hasVectorValue(this, Part))
9035       State.reset(this, VPhi, Part);
9036     else
9037       State.set(this, VPhi, Part);
9038     // NOTE: Currently we need to update the value of the operand, so the next
9039     // predicated iteration inserts its generated value in the correct vector.
9040     State.reset(getOperand(0), VPhi, Part);
9041   } else {
9042     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9043     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9044     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9045                      PredicatingBB);
9046     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9047     if (State.hasScalarValue(this, *State.Instance))
9048       State.reset(this, Phi, *State.Instance);
9049     else
9050       State.set(this, Phi, *State.Instance);
9051     // NOTE: Currently we need to update the value of the operand, so the next
9052     // predicated iteration inserts its generated value in the correct vector.
9053     State.reset(getOperand(0), Phi, *State.Instance);
9054   }
9055 }
9056 
9057 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9058   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9059   State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
9060                                         StoredValue ? nullptr : getVPValue(),
9061                                         getAddr(), StoredValue, getMask());
9062 }
9063 
9064 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9065 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9066 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9067 // for predication.
9068 static ScalarEpilogueLowering getScalarEpilogueLowering(
9069     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9070     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9071     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9072     LoopVectorizationLegality &LVL) {
9073   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9074   // don't look at hints or options, and don't request a scalar epilogue.
9075   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9076   // LoopAccessInfo (due to code dependency and not being able to reliably get
9077   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9078   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9079   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9080   // back to the old way and vectorize with versioning when forced. See D81345.)
9081   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9082                                                       PGSOQueryType::IRPass) &&
9083                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9084     return CM_ScalarEpilogueNotAllowedOptSize;
9085 
9086   // 2) If set, obey the directives
9087   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9088     switch (PreferPredicateOverEpilogue) {
9089     case PreferPredicateTy::ScalarEpilogue:
9090       return CM_ScalarEpilogueAllowed;
9091     case PreferPredicateTy::PredicateElseScalarEpilogue:
9092       return CM_ScalarEpilogueNotNeededUsePredicate;
9093     case PreferPredicateTy::PredicateOrDontVectorize:
9094       return CM_ScalarEpilogueNotAllowedUsePredicate;
9095     };
9096   }
9097 
9098   // 3) If set, obey the hints
9099   switch (Hints.getPredicate()) {
9100   case LoopVectorizeHints::FK_Enabled:
9101     return CM_ScalarEpilogueNotNeededUsePredicate;
9102   case LoopVectorizeHints::FK_Disabled:
9103     return CM_ScalarEpilogueAllowed;
9104   };
9105 
9106   // 4) if the TTI hook indicates this is profitable, request predication.
9107   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9108                                        LVL.getLAI()))
9109     return CM_ScalarEpilogueNotNeededUsePredicate;
9110 
9111   return CM_ScalarEpilogueAllowed;
9112 }
9113 
9114 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If values have been set for this Def, return the one relevant for \p Part.
9116   if (hasVectorValue(Def, Part))
9117     return Data.PerPartOutput[Def][Part];
9118 
9119   if (!hasScalarValue(Def, {Part, 0})) {
9120     Value *IRV = Def->getLiveInIRValue();
9121     Value *B = ILV->getBroadcastInstrs(IRV);
9122     set(Def, B, Part);
9123     return B;
9124   }
9125 
9126   Value *ScalarValue = get(Def, {Part, 0});
9127   // If we aren't vectorizing, we can just copy the scalar map values over
9128   // to the vector map.
9129   if (VF.isScalar()) {
9130     set(Def, ScalarValue, Part);
9131     return ScalarValue;
9132   }
9133 
9134   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9135   bool IsUniform = RepR && RepR->isUniform();
9136 
9137   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9138   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9139 
9140   // Set the insert point after the last scalarized instruction. This
9141   // ensures the insertelement sequence will directly follow the scalar
9142   // definitions.
9143   auto OldIP = Builder.saveIP();
9144   auto NewIP = std::next(BasicBlock::iterator(LastInst));
9145   Builder.SetInsertPoint(&*NewIP);
9146 
9147   // However, if we are vectorizing, we need to construct the vector values.
9148   // If the value is known to be uniform after vectorization, we can just
9149   // broadcast the scalar value corresponding to lane zero for each unroll
9150   // iteration. Otherwise, we construct the vector values using
9151   // insertelement instructions. Since the resulting vectors are stored in
9152   // State, we will only generate the insertelements once.
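  // Non-uniform packing for VF = 4 looks roughly like:
  //   %v0 = insertelement <4 x i32> poison, i32 %s0, i32 0
  //   %v1 = insertelement <4 x i32> %v0, i32 %s1, i32 1
  //   ... one insertelement per remaining lane.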
9153   Value *VectorValue = nullptr;
9154   if (IsUniform) {
9155     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9156     set(Def, VectorValue, Part);
9157   } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison =
        PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
9162     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9163       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9164     VectorValue = get(Def, Part);
9165   }
9166   Builder.restoreIP(OldIP);
9167   return VectorValue;
9168 }
9169 
9170 // Process the loop in the VPlan-native vectorization path. This path builds
9171 // VPlan upfront in the vectorization pipeline, which allows to apply
9172 // VPlan-to-VPlan transformations from the very beginning without modifying the
9173 // input LLVM IR.
9174 static bool processLoopInVPlanNativePath(
9175     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9176     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9177     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9178     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9179     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
9180 
9181   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9182     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9183     return false;
9184   }
9185   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9186   Function *F = L->getHeader()->getParent();
9187   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9188 
9189   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9190       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
9191 
9192   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9193                                 &Hints, IAI);
9194   // Use the planner for outer loop vectorization.
9195   // TODO: CM is not used at this point inside the planner. Turn CM into an
9196   // optional argument if we don't need it in the future.
9197   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
9198 
9199   // Get user vectorization factor.
9200   ElementCount UserVF = Hints.getWidth();
9201 
9202   // Plan how to best vectorize, return the best VF and its cost.
9203   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9204 
9205   // If we are stress testing VPlan builds, do not attempt to generate vector
9206   // code. Masked vector code generation support will follow soon.
9207   // Also, do not attempt to vectorize if no vector code will be produced.
9208   if (VPlanBuildStressTest || EnableVPlanPredication ||
9209       VectorizationFactor::Disabled() == VF)
9210     return false;
9211 
9212   LVP.setBestPlan(VF.Width, 1);
9213 
9214   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
9215                          &CM, BFI, PSI);
9216   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9217                     << L->getHeader()->getParent()->getName() << "\"\n");
9218   LVP.executePlan(LB, DT);
9219 
9220   // Mark the loop as already vectorized to avoid vectorizing again.
9221   Hints.setAlreadyVectorized();
9222 
9223   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9224   return true;
9225 }
9226 
9227 // Emit a remark if there are stores to floats that required a floating point
9228 // extension. If the vectorized loop was generated with floating point there
9229 // will be a performance penalty from the conversion overhead and the change in
9230 // the vector width.
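// For illustration, a chain such as
//   %e = fpext float %a to double
//   %m = fmul double %e, %b
//   %t = fptrunc double %m to float
//   store float %t, float* %p
// is found by walking up from the float store and triggers the remark at the
// fpext.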
9231 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9232   SmallVector<Instruction *, 4> Worklist;
9233   for (BasicBlock *BB : L->getBlocks()) {
9234     for (Instruction &Inst : *BB) {
9235       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9236         if (S->getValueOperand()->getType()->isFloatTy())
9237           Worklist.push_back(S);
9238       }
9239     }
9240   }
9241 
  // Traverse upwards from the floating point stores, searching for floating
  // point conversions.
9244   SmallPtrSet<const Instruction *, 4> Visited;
9245   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9246   while (!Worklist.empty()) {
9247     auto *I = Worklist.pop_back_val();
9248     if (!L->contains(I))
9249       continue;
9250     if (!Visited.insert(I).second)
9251       continue;
9252 
9253     // Emit a remark if the floating point store required a floating
9254     // point conversion.
9255     // TODO: More work could be done to identify the root cause such as a
9256     // constant or a function return type and point the user to it.
9257     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9258       ORE->emit([&]() {
9259         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9260                                           I->getDebugLoc(), L->getHeader())
9261                << "floating point conversion changes vector width. "
9262                << "Mixed floating point precision requires an up/down "
9263                << "cast that will negatively impact performance.";
9264       });
9265 
9266     for (Use &Op : I->operands())
9267       if (auto *OpI = dyn_cast<Instruction>(Op))
9268         Worklist.push_back(OpI);
9269   }
9270 }
9271 
9272 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9273     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9274                                !EnableLoopInterleaving),
9275       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9276                               !EnableLoopVectorization) {}
9277 
9278 bool LoopVectorizePass::processLoop(Loop *L) {
9279   assert((EnableVPlanNativePath || L->isInnermost()) &&
9280          "VPlan-native path is not enabled. Only process inner loops.");
9281 
9282 #ifndef NDEBUG
9283   const std::string DebugLocStr = getDebugLocString(L);
9284 #endif /* NDEBUG */
9285 
9286   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
9287                     << L->getHeader()->getParent()->getName() << "\" from "
9288                     << DebugLocStr << "\n");
9289 
9290   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
9291 
9292   LLVM_DEBUG(
9293       dbgs() << "LV: Loop hints:"
9294              << " force="
9295              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9296                      ? "disabled"
9297                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9298                             ? "enabled"
9299                             : "?"))
9300              << " width=" << Hints.getWidth()
9301              << " unroll=" << Hints.getInterleave() << "\n");
9302 
9303   // Function containing loop
9304   Function *F = L->getHeader()->getParent();
9305 
9306   // Looking at the diagnostic output is the only way to determine if a loop
9307   // was vectorized (other than looking at the IR or machine code), so it
9308   // is important to generate an optimization remark for each loop. Most of
9309   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9310   // generated as OptimizationRemark and OptimizationRemarkMissed are
9311   // less verbose reporting vectorized loops and unvectorized loops that may
9312   // benefit from vectorization, respectively.
9313 
9314   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9315     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9316     return false;
9317   }
9318 
9319   PredicatedScalarEvolution PSE(*SE, *L);
9320 
9321   // Check if it is legal to vectorize the loop.
9322   LoopVectorizationRequirements Requirements(*ORE);
9323   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9324                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9325   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9326     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9327     Hints.emitRemarkWithHints();
9328     return false;
9329   }
9330 
9331   // Check the function attributes and profiles to find out if this function
9332   // should be optimized for size.
9333   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9334       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9335 
9336   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9337   // here. They may require CFG and instruction level transformations before
9338   // even evaluating whether vectorization is profitable. Since we cannot modify
9339   // the incoming IR, we need to build VPlan upfront in the vectorization
9340   // pipeline.
9341   if (!L->isInnermost())
9342     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9343                                         ORE, BFI, PSI, Hints);
9344 
9345   assert(L->isInnermost() && "Inner loop expected.");
9346 
9347   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9348   // count by optimizing for size, to minimize overheads.
9349   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9350   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9351     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9352                       << "This loop is worth vectorizing only if no scalar "
9353                       << "iteration overheads are incurred.");
9354     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9355       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9356     else {
9357       LLVM_DEBUG(dbgs() << "\n");
9358       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9359     }
9360   }
9361 
9362   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem correct -- what if the loop is
9364   // an integer loop and the vector instructions selected are purely integer
9365   // vector instructions?
9366   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9367     reportVectorizationFailure(
9368         "Can't vectorize when the NoImplicitFloat attribute is used",
9369         "loop not vectorized due to NoImplicitFloat attribute",
9370         "NoImplicitFloat", ORE, L);
9371     Hints.emitRemarkWithHints();
9372     return false;
9373   }
9374 
9375   // Check if the target supports potentially unsafe FP vectorization.
9376   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9377   // for the target we're vectorizing for, to make sure none of the
9378   // additional fp-math flags can help.
9379   if (Hints.isPotentiallyUnsafe() &&
9380       TTI->isFPVectorizationPotentiallyUnsafe()) {
9381     reportVectorizationFailure(
9382         "Potentially unsafe FP op prevents vectorization",
9383         "loop not vectorized due to unsafe FP support.",
9384         "UnsafeFP", ORE, L);
9385     Hints.emitRemarkWithHints();
9386     return false;
9387   }
9388 
9389   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9390   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9391 
9392   // If an override option has been passed in for interleaved accesses, use it.
9393   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9394     UseInterleaved = EnableInterleavedMemAccesses;
9395 
9396   // Analyze interleaved memory accesses.
9397   if (UseInterleaved) {
9398     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9399   }
9400 
9401   // Use the cost model.
9402   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9403                                 F, &Hints, IAI);
9404   CM.collectValuesToIgnore();
9405 
9406   // Use the planner for vectorization.
9407   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9408 
9409   // Get user vectorization factor and interleave count.
9410   ElementCount UserVF = Hints.getWidth();
9411   unsigned UserIC = Hints.getInterleave();
9412 
9413   // Plan how to best vectorize, return the best VF and its cost.
9414   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9415 
9416   VectorizationFactor VF = VectorizationFactor::Disabled();
9417   unsigned IC = 1;
9418 
9419   if (MaybeVF) {
9420     VF = *MaybeVF;
9421     // Select the interleave count.
9422     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9423   }
9424 
9425   // Identify the diagnostic messages that should be produced.
9426   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9427   bool VectorizeLoop = true, InterleaveLoop = true;
9428   if (Requirements.doesNotMeet(F, L, Hints)) {
9429     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9430                          "requirements.\n");
9431     Hints.emitRemarkWithHints();
9432     return false;
9433   }
9434 
9435   if (VF.Width.isScalar()) {
9436     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9437     VecDiagMsg = std::make_pair(
9438         "VectorizationNotBeneficial",
9439         "the cost-model indicates that vectorization is not beneficial");
9440     VectorizeLoop = false;
9441   }
9442 
9443   if (!MaybeVF && UserIC > 1) {
9444     // Tell the user interleaving was avoided up-front, despite being explicitly
9445     // requested.
9446     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9447                          "interleaving should be avoided up front\n");
9448     IntDiagMsg = std::make_pair(
9449         "InterleavingAvoided",
9450         "Ignoring UserIC, because interleaving was avoided up front");
9451     InterleaveLoop = false;
9452   } else if (IC == 1 && UserIC <= 1) {
9453     // Tell the user interleaving is not beneficial.
9454     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9455     IntDiagMsg = std::make_pair(
9456         "InterleavingNotBeneficial",
9457         "the cost-model indicates that interleaving is not beneficial");
9458     InterleaveLoop = false;
9459     if (UserIC == 1) {
9460       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9461       IntDiagMsg.second +=
9462           " and is explicitly disabled or interleave count is set to 1";
9463     }
9464   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
9466     LLVM_DEBUG(
9467         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9468     IntDiagMsg = std::make_pair(
9469         "InterleavingBeneficialButDisabled",
9470         "the cost-model indicates that interleaving is beneficial "
9471         "but is explicitly disabled or interleave count is set to 1");
9472     InterleaveLoop = false;
9473   }
9474 
9475   // Override IC if user provided an interleave count.
9476   IC = UserIC > 0 ? UserIC : IC;
9477 
9478   // Emit diagnostic messages, if any.
9479   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9480   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9482     ORE->emit([&]() {
9483       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9484                                       L->getStartLoc(), L->getHeader())
9485              << VecDiagMsg.second;
9486     });
9487     ORE->emit([&]() {
9488       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9489                                       L->getStartLoc(), L->getHeader())
9490              << IntDiagMsg.second;
9491     });
9492     return false;
9493   } else if (!VectorizeLoop && InterleaveLoop) {
9494     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9495     ORE->emit([&]() {
9496       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9497                                         L->getStartLoc(), L->getHeader())
9498              << VecDiagMsg.second;
9499     });
9500   } else if (VectorizeLoop && !InterleaveLoop) {
9501     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9502                       << ") in " << DebugLocStr << '\n');
9503     ORE->emit([&]() {
9504       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9505                                         L->getStartLoc(), L->getHeader())
9506              << IntDiagMsg.second;
9507     });
9508   } else if (VectorizeLoop && InterleaveLoop) {
9509     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9510                       << ") in " << DebugLocStr << '\n');
9511     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9512   }
9513 
9514   LVP.setBestPlan(VF.Width, IC);
9515 
9516   using namespace ore;
9517   bool DisableRuntimeUnroll = false;
9518   MDNode *OrigLoopID = L->getLoopID();
9519 
9520   if (!VectorizeLoop) {
9521     assert(IC > 1 && "interleave count should not be 1 or 0");
9522     // If we decided that it is not legal to vectorize the loop, then
9523     // interleave it.
9524     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
9525                                BFI, PSI);
9526     LVP.executePlan(Unroller, DT);
9527 
9528     ORE->emit([&]() {
9529       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9530                                 L->getHeader())
9531              << "interleaved loop (interleaved count: "
9532              << NV("InterleaveCount", IC) << ")";
9533     });
9534   } else {
9535     // If we decided that it is *legal* to vectorize the loop, then do it.
9536 
9537     // Consider vectorizing the epilogue too if it's profitable.
9538     VectorizationFactor EpilogueVF =
9539       CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9540     if (EpilogueVF.Width.isVector()) {
9541 
9542       // The first pass vectorizes the main loop and creates a scalar epilogue
9543       // to be vectorized by executing the plan (potentially with a different
9544       // factor) again shortly afterwards.
9545       EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9546                                         EpilogueVF.Width.getKnownMinValue(), 1);
9547       EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
9548                                          &LVL, &CM, BFI, PSI);
9549 
9550       LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9551       LVP.executePlan(MainILV, DT);
9552       ++LoopsVectorized;
9553 
9554       simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9555       formLCSSARecursively(*L, *DT, LI, SE);
9556 
9557       // Second pass vectorizes the epilogue and adjusts the control flow
9558       // edges from the first pass.
9559       LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9560       EPI.MainLoopVF = EPI.EpilogueVF;
9561       EPI.MainLoopUF = EPI.EpilogueUF;
9562       EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9563                                                ORE, EPI, &LVL, &CM, BFI, PSI);
9564       LVP.executePlan(EpilogILV, DT);
9565       ++LoopsEpilogueVectorized;
9566 
9567       if (!MainILV.areSafetyChecksAdded())
9568         DisableRuntimeUnroll = true;
9569     } else {
9570       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9571                              &LVL, &CM, BFI, PSI);
9572       LVP.executePlan(LB, DT);
9573       ++LoopsVectorized;
9574 
9575       // Add metadata to disable runtime unrolling a scalar loop when there are
9576       // no runtime checks about strides and memory. A scalar loop that is
9577       // rarely used is not worth unrolling.
9578       if (!LB.areSafetyChecksAdded())
9579         DisableRuntimeUnroll = true;
9580     }
9581 
9582     // Report the vectorization decision.
9583     ORE->emit([&]() {
9584       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9585                                 L->getHeader())
9586              << "vectorized loop (vectorization width: "
9587              << NV("VectorizationFactor", VF.Width)
9588              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9589     });
9590 
9591     if (ORE->allowExtraAnalysis(LV_NAME))
9592       checkMixedPrecision(L, ORE);
9593   }
9594 
9595   Optional<MDNode *> RemainderLoopID =
9596       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9597                                       LLVMLoopVectorizeFollowupEpilogue});
9598   if (RemainderLoopID.hasValue()) {
9599     L->setLoopID(RemainderLoopID.getValue());
9600   } else {
9601     if (DisableRuntimeUnroll)
9602       AddRuntimeUnrollDisableMetaData(L);
9603 
9604     // Mark the loop as already vectorized to avoid vectorizing again.
9605     Hints.setAlreadyVectorized();
9606   }
9607 
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Report whether anything was changed and whether the CFG was modified.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
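  // LoopAccessInfo is a loop analysis; fetch it lazily, per loop, through
  // the inner loop analysis manager rather than computing it up front.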
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
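  // ProfileSummaryInfo is a module analysis, so only a cached result can be
  // queried through the module analysis manager proxy here.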
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve the LoopInfo/DominatorTree analyses with
  // outer loop vectorization. Until this is addressed, mark these analyses
  // as preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}