1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
// Short pass name; also used as the debug type for LLVM_DEBUG output.
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
// Debug type used for extra-verbose diagnostics in debug builds.
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

// Pass-wide statistics, reported with -stats.
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

/// Master switch for vectorizing the remainder (epilogue) loop left over
/// after the main vector loop.
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

/// Testing override: any value greater than 1 forces that VF for all
/// applicable epilogue loops.
static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

/// Main-loop VF threshold below which epilogue vectorization is not
/// considered.
static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

/// Cap on the number of runtime memory checks emitted when vectorization was
/// requested via a vectorize(enable) pragma.
static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));
205 
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,          // Don't tail-fold; emit a scalar epilogue.
    PredicateElseScalarEpilogue, // Try tail-folding; else scalar epilogue.
    PredicateOrDontVectorize     // Try tail-folding; else don't vectorize.
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                         "scalar-epilogue",
                         "Don't tail-predicate loops, create scalar epilogue"),
              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                         "predicate-else-scalar-epilogue",
                         "prefer tail-folding, create scalar epilogue if tail "
                         "folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefers tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));
236 
/// When set, choose the VF by the smallest element type in the loop instead of
/// the widest, maximizing memory bandwidth utilization.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

/// Enables vectorization of interleaved memory access groups.
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// Trip-count threshold below which loops are not interleaved.
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with a estimated constant trip count "
             "below this number"));

/// Testing override for the target-reported number of scalar registers.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

/// Testing override for the target-reported number of vector registers.
static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Testing override for the target's max interleave factor on scalar loops.
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

/// Testing override for the target's max interleave factor on vector loops.
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

/// Testing override that pins every instruction's cost to one constant,
/// making cost-model decisions deterministic across targets.
static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

/// Testing override that pretends the target supports scalable vectors.
static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

/// Loop-cost threshold below which a loop is considered 'small' by the
/// interleaving heuristics.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

/// When set, consult block-frequency / PGO data when deciding whether to
/// vectorize (limits code growth in cold regions).
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

/// When interleaving, count the induction variable's register cost only once.
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

/// Enables if-predication of conditional stores during vectorization.
static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

/// Interleave-count cap for scalar reductions inside nested loops.
static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

/// When set, prefer in-loop reductions over out-of-loop (vector) reductions,
/// regardless of the target's stated preference.
static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));
333 
/// Enables vectorization of loops that require in-order (strict) FP
/// reductions, where FP operations may not be reordered.
static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

/// When set, predicate the reduction operation itself instead of emitting a
/// select after the loop.
static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

/// Enables the experimental VPlan-native vectorization path, which adds
/// support for outer-loop vectorization (see the file header for context).
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

/// Master switches for interleaving and vectorization, declared in the llvm
/// namespace so other passes can query them.
cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// When set, VPlans are dumped in dot (graph) format rather than plain text.
cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));
378 /// A helper function that returns true if the given type is irregular. The
379 /// type is irregular if its allocated size doesn't equal the store size of an
380 /// element of the corresponding vector type.
381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382   // Determine if an array of N elements of type Ty is "bitcast compatible"
383   // with a <N x Ty> vector.
384   // This is only true if there is no padding between the array elements.
385   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386 }
387 
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() {
  // A predicated block is assumed to run on half of the header's iterations.
  constexpr unsigned AssumedReciprocalProb = 2;
  return AssumedReciprocalProb;
}
395 
396 /// A helper function that returns an integer or floating-point constant with
397 /// value C.
398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400                            : ConstantFP::get(Ty, C);
401 }
402 
403 /// Returns "best known" trip count for the specified loop \p L as defined by
404 /// the following procedure:
405 ///   1) Returns exact trip count if it is known.
406 ///   2) Returns expected trip count according to profile data if any.
407 ///   3) Returns upper bound estimate if it is known.
408 ///   4) Returns None if all of the above failed.
409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410   // Check if exact trip count is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412     return ExpectedTC;
413 
414   // Check if there is an expected trip count available from profile data.
415   if (LoopVectorizeWithBlockFrequency)
416     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417       return EstimatedTC;
418 
419   // Check if upper bound estimate is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421     return ExpectedTC;
422 
423   return None;
424 }
425 
426 // Forward declare GeneratedRTChecks.
427 class GeneratedRTChecks;
428 
429 namespace llvm {
430 
431 AnalysisKey ShouldRunExtraVectorPasses::Key;
432 
433 /// InnerLoopVectorizer vectorizes loops which contain only one basic
434 /// block to a specified vectorization factor (VF).
435 /// This class performs the widening of scalars into vectors, or multiple
436 /// scalars. This class also implements the following features:
437 /// * It inserts an epilogue loop for handling loops that don't have iteration
438 ///   counts that are known to be a multiple of the vectorization factor.
439 /// * It handles the code generation for reduction variables.
440 /// * Scalarization (implementation using scalars) of un-vectorizable
441 ///   instructions.
442 /// InnerLoopVectorizer does not perform any vectorization-legality
443 /// checks, and relies on the caller to check for the different legality
444 /// aspects. The InnerLoopVectorizer relies on the
445 /// LoopVectorizationLegality class to provide information about the induction
446 /// and reduction variables that were found to a given vectorization factor.
447 class InnerLoopVectorizer {
448 public:
  /// Construct an InnerLoopVectorizer for \p OrigLoop with vectorization
  /// factor \p VecWidth and unroll (interleave) factor \p UnrollFactor.
  /// The analysis objects (PSE, LI, DT, TLI, TTI, AC, ORE, BFI, PSI), the
  /// legality object \p LVL, the cost model \p CM and the runtime checks
  /// \p RTChecks are supplied by the caller and stored as members for use
  /// during code generation.
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }
466 
467   virtual ~InnerLoopVectorizer() = default;
468 
469   /// Create a new empty loop that will contain vectorized instructions later
470   /// on, while the old loop will be used as the scalar remainder. Control flow
471   /// is generated around the vectorized (and scalar epilogue) loops consisting
472   /// of various checks and bypasses. Return the pre-header block of the new
473   /// loop.
474   /// In the case of epilogue vectorization, this function is overriden to
475   /// handle the more complex control flow around the loops.
476   virtual BasicBlock *createVectorizedLoopSkeleton();
477 
478   /// Widen a single call instruction within the innermost loop.
479   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
480                             VPTransformState &State);
481 
482   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
483   void fixVectorizedLoop(VPTransformState &State);
484 
485   // Return true if any runtime check is added.
486   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
487 
488   /// A type for vectorized values in the new loop. Each value from the
489   /// original loop, when vectorized, is represented by UF vector values in the
490   /// new unrolled loop, where UF is the unroll factor.
491   using VectorParts = SmallVector<Value *, 2>;
492 
493   /// Vectorize a single first-order recurrence or pointer induction PHINode in
494   /// a block. This method handles the induction variable canonicalization. It
495   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
496   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
497                            VPTransformState &State);
498 
499   /// A helper function to scalarize a single Instruction in the innermost loop.
500   /// Generates a sequence of scalar instances for each lane between \p MinLane
501   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
502   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
503   /// Instr's operands.
504   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
505                             const VPIteration &Instance, bool IfPredicateInstr,
506                             VPTransformState &State);
507 
508   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
509   /// is provided, the integer induction variable will first be truncated to
510   /// the corresponding type.
511   void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
512                              Value *Start, TruncInst *Trunc, VPValue *Def,
513                              VPTransformState &State);
514 
515   /// Construct the vector value of a scalarized value \p V one lane at a time.
516   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
517                                  VPTransformState &State);
518 
519   /// Try to vectorize interleaved access group \p Group with the base address
520   /// given in \p Addr, optionally masking the vector operations if \p
521   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
522   /// values in the vectorized loop.
523   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
524                                 ArrayRef<VPValue *> VPDefs,
525                                 VPTransformState &State, VPValue *Addr,
526                                 ArrayRef<VPValue *> StoredValues,
527                                 VPValue *BlockInMask = nullptr);
528 
529   /// Set the debug location in the builder \p Ptr using the debug location in
530   /// \p V. If \p Ptr is None then it uses the class member's Builder.
531   void setDebugLocFromInst(const Value *V,
532                            Optional<IRBuilder<> *> CustomBuilder = None);
533 
534   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
535   void fixNonInductionPHIs(VPTransformState &State);
536 
537   /// Returns true if the reordering of FP operations is not allowed, but we are
538   /// able to vectorize with strict in-order reductions for the given RdxDesc.
539   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
540 
541   /// Create a broadcast instruction. This method generates a broadcast
542   /// instruction (shuffle) for loop invariant values and for the induction
543   /// value. If this is the induction variable then we extend it to N, N+1, ...
544   /// this is needed because each iteration in the loop corresponds to a SIMD
545   /// element.
546   virtual Value *getBroadcastInstrs(Value *V);
547 
548   /// Add metadata from one instruction to another.
549   ///
550   /// This includes both the original MDs from \p From and additional ones (\see
551   /// addNewMetadata).  Use this for *newly created* instructions in the vector
552   /// loop.
553   void addMetadata(Instruction *To, Instruction *From);
554 
555   /// Similar to the previous function but it adds the metadata to a
556   /// vector of instructions.
557   void addMetadata(ArrayRef<Value *> To, Instruction *From);
558 
559 protected:
560   friend class LoopVectorizationPlanner;
561 
562   /// A small list of PHINodes.
563   using PhiVector = SmallVector<PHINode *, 4>;
564 
565   /// A type for scalarized values in the new loop. Each value from the
566   /// original loop, when scalarized, is represented by UF x VF scalar values
567   /// in the new unrolled loop, where UF is the unroll factor and VF is the
568   /// vectorization factor.
569   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
570 
571   /// Set up the values of the IVs correctly when exiting the vector loop.
572   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
573                     Value *CountRoundDown, Value *EndValue,
574                     BasicBlock *MiddleBlock);
575 
576   /// Create a new induction variable inside L.
577   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
578                                    Value *Step, Instruction *DL);
579 
580   /// Handle all cross-iteration phis in the header.
581   void fixCrossIterationPHIs(VPTransformState &State);
582 
583   /// Create the exit value of first order recurrences in the middle block and
584   /// update their users.
585   void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
586 
587   /// Create code for the loop exit value of the reduction.
588   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
589 
590   /// Clear NSW/NUW flags from reduction instructions if necessary.
591   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
592                                VPTransformState &State);
593 
594   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
595   /// means we need to add the appropriate incoming value from the middle
596   /// block as exiting edges from the scalar epilogue loop (if present) are
597   /// already in place, and we exit the vector loop exclusively to the middle
598   /// block.
599   void fixLCSSAPHIs(VPTransformState &State);
600 
601   /// Iteratively sink the scalarized operands of a predicated instruction into
602   /// the block that was created for it.
603   void sinkScalarOperands(Instruction *PredInst);
604 
605   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
606   /// represented as.
607   void truncateToMinimalBitwidths(VPTransformState &State);
608 
609   /// This function adds
610   /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
611   /// to each vector element of Val. The sequence starts at StartIndex.
612   /// \p Opcode is relevant for FP induction variable.
613   virtual Value *
614   getStepVector(Value *Val, Value *StartIdx, Value *Step,
615                 Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
616 
617   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
618   /// variable on which to base the steps, \p Step is the size of the step, and
619   /// \p EntryVal is the value from the original loop that maps to the steps.
620   /// Note that \p EntryVal doesn't have to be an induction variable - it
621   /// can also be a truncate instruction.
622   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
623                         const InductionDescriptor &ID, VPValue *Def,
624                         VPTransformState &State);
625 
626   /// Create a vector induction phi node based on an existing scalar one. \p
627   /// EntryVal is the value from the original loop that maps to the vector phi
628   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
629   /// truncate instruction, instead of widening the original IV, we widen a
630   /// version of the IV truncated to \p EntryVal's type.
631   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
632                                        Value *Step, Value *Start,
633                                        Instruction *EntryVal, VPValue *Def,
634                                        VPTransformState &State);
635 
636   /// Returns true if an instruction \p I should be scalarized instead of
637   /// vectorized for the chosen vectorization factor.
638   bool shouldScalarizeInstruction(Instruction *I) const;
639 
640   /// Returns true if we should generate a scalar version of \p IV.
641   bool needsScalarInduction(Instruction *IV) const;
642 
643   /// Generate a shuffle sequence that will reverse the vector Vec.
644   virtual Value *reverseVector(Value *Vec);
645 
646   /// Returns (and creates if needed) the original loop trip count.
647   Value *getOrCreateTripCount(Loop *NewLoop);
648 
649   /// Returns (and creates if needed) the trip count of the widened loop.
650   Value *getOrCreateVectorTripCount(Loop *NewLoop);
651 
652   /// Returns a bitcasted value to the requested vector type.
653   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
654   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
655                                 const DataLayout &DL);
656 
657   /// Emit a bypass check to see if the vector trip count is zero, including if
658   /// it overflows.
659   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
660 
661   /// Emit a bypass check to see if all of the SCEV assumptions we've
662   /// had to make are correct. Returns the block containing the checks or
663   /// nullptr if no checks have been added.
664   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
665 
666   /// Emit bypass checks to check any memory assumptions we may have made.
667   /// Returns the block containing the checks or nullptr if no checks have been
668   /// added.
669   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
670 
671   /// Compute the transformed value of Index at offset StartValue using step
672   /// StepValue.
673   /// For integer induction, returns StartValue + Index * StepValue.
674   /// For pointer induction, returns StartValue[Index * StepValue].
675   /// FIXME: The newly created binary instructions should contain nsw/nuw
676   /// flags, which can be found from the original scalar operations.
677   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
678                               const DataLayout &DL,
679                               const InductionDescriptor &ID,
680                               BasicBlock *VectorHeader) const;
681 
682   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
683   /// vector loop preheader, middle block and scalar preheader. Also
684   /// allocate a loop object for the new vector loop and return it.
685   Loop *createVectorLoopSkeleton(StringRef Prefix);
686 
687   /// Create new phi nodes for the induction variables to resume iteration count
688   /// in the scalar epilogue, from where the vectorized loop left off (given by
689   /// \p VectorTripCount).
690   /// In cases where the loop skeleton is more complicated (eg. epilogue
691   /// vectorization) and the resume values can come from an additional bypass
692   /// block, the \p AdditionalBypass pair provides information about the bypass
693   /// block and the end value on the edge from bypass to this loop.
694   void createInductionResumeValues(
695       Loop *L, Value *VectorTripCount,
696       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
697 
698   /// Complete the loop skeleton by adding debug MDs, creating appropriate
699   /// conditional branches in the middle block, preparing the builder and
700   /// running the verifier. Take in the vector loop \p L as argument, and return
701   /// the preheader of the completed vector loop.
702   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
703 
704   /// Add additional metadata to \p To that was not present on \p Orig.
705   ///
706   /// Currently this is used to add the noalias annotations based on the
707   /// inserted memchecks.  Use this for instructions that are *cloned* into the
708   /// vector loop.
709   void addNewMetadata(Instruction *To, const Instruction *Orig);
710 
711   /// Collect poison-generating recipes that may generate a poison value that is
712   /// used after vectorization, even when their operands are not poison. Those
713   /// recipes meet the following conditions:
714   ///  * Contribute to the address computation of a recipe generating a widen
715   ///    memory load/store (VPWidenMemoryInstructionRecipe or
716   ///    VPInterleaveRecipe).
717   ///  * Such a widen memory load/store has at least one underlying Instruction
718   ///    that is in a basic block that needs predication and after vectorization
719   ///    the generated instruction won't be predicated.
720   void collectPoisonGeneratingRecipes(VPTransformState &State);
721 
722   /// Allow subclasses to override and print debug traces before/after vplan
723   /// execution, when trace information is requested.
724   virtual void printDebugTracesAtStart(){};
725   virtual void printDebugTracesAtEnd(){};
726 
727   /// The original loop.
728   Loop *OrigLoop;
729 
730   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
731   /// dynamic knowledge to simplify SCEV expressions and converts them to a
732   /// more usable form.
733   PredicatedScalarEvolution &PSE;
734 
735   /// Loop Info.
736   LoopInfo *LI;
737 
738   /// Dominator Tree.
739   DominatorTree *DT;
740 
741   /// Alias Analysis.
742   AAResults *AA;
743 
744   /// Target Library Info.
745   const TargetLibraryInfo *TLI;
746 
747   /// Target Transform Info.
748   const TargetTransformInfo *TTI;
749 
750   /// Assumption Cache.
751   AssumptionCache *AC;
752 
753   /// Interface to emit optimization remarks.
754   OptimizationRemarkEmitter *ORE;
755 
  /// LoopVersioning.  It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;
762 
763   /// The vectorization SIMD factor to use. Each vector will have this many
764   /// vector elements.
765   ElementCount VF;
766 
767   /// The vectorization unroll factor to use. Each scalar is vectorized to this
768   /// many different vector instructions.
769   unsigned UF;
770 
771   /// The builder that we use
772   IRBuilder<> Builder;
773 
774   // --- Vectorization state ---
775 
776   /// The vector-loop preheader.
777   BasicBlock *LoopVectorPreHeader;
778 
779   /// The scalar-loop preheader.
780   BasicBlock *LoopScalarPreHeader;
781 
782   /// Middle Block between the vector and the scalar.
783   BasicBlock *LoopMiddleBlock;
784 
785   /// The unique ExitBlock of the scalar loop if one exists.  Note that
786   /// there can be multiple exiting edges reaching this block.
787   BasicBlock *LoopExitBlock;
788 
789   /// The vector loop body.
790   BasicBlock *LoopVectorBody;
791 
792   /// The scalar loop body.
793   BasicBlock *LoopScalarBody;
794 
795   /// A list of all bypass blocks. The first block is the entry of the loop.
796   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
797 
798   /// The new Induction variable which was added to the new block.
799   PHINode *Induction = nullptr;
800 
801   /// The induction variable of the old basic block.
802   PHINode *OldInduction = nullptr;
803 
804   /// Store instructions that were predicated.
805   SmallVector<Instruction *, 4> PredicatedInstructions;
806 
807   /// Trip count of the original loop.
808   Value *TripCount = nullptr;
809 
810   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
811   Value *VectorTripCount = nullptr;
812 
  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;
818 
819   // Record whether runtime checks are added.
820   bool AddedSafetyChecks = false;
821 
822   // Holds the end values for each induction variable. We save the end values
823   // so we can later fix-up the external users of the induction variables.
824   DenseMap<PHINode *, Value *> IVEndValues;
825 
826   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
827   // fixed up at the end of vector code generation.
828   SmallVector<PHINode *, 8> OrigPHIsToFix;
829 
830   /// BFI and PSI are used to check for profile guided size optimizations.
831   BlockFrequencyInfo *BFI;
832   ProfileSummaryInfo *PSI;
833 
  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;
837 
838   /// Structure to hold information about generated runtime checks, responsible
839   /// for cleaning the checks, if vectorization turns out unprofitable.
840   GeneratedRTChecks &RTChecks;
841 };
842 
/// A specialization of the inner loop vectorizer that unrolls the given loop
/// without vectorizing it: the constructor forces a fixed vectorization
/// factor of one (ElementCount::getFixed(1)), so only the unroll factor
/// \p UnrollFactor takes effect.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(
      Value *Val, Value *StartIdx, Value *Step,
      Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};
864 
/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  /// VF and UF chosen for the main vector loop.
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  /// VF and UF chosen for the epilogue vector loop. The constructor asserts
  /// that EpilogueUF is 1.
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  // Skeleton blocks and trip counts shared between the two vectorization
  // stages. NOTE(review): populated outside this struct, presumably by the
  // skeleton-creation methods of the epilogue vectorizers — confirm there.
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    // Only a single epilogue iteration per vector iteration is supported.
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
887 
888 /// An extension of the inner loop vectorizer that creates a skeleton for a
889 /// vectorized loop that has its epilogue (residual) also vectorized.
890 /// The idea is to run the vplan on a given loop twice, firstly to setup the
891 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
892 /// from the first step and vectorize the epilogue.  This is achieved by
893 /// deriving two concrete strategy classes from this base class and invoking
894 /// them in succession from the loop vectorizer planner.
895 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
896 public:
897   InnerLoopAndEpilogueVectorizer(
898       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
899       DominatorTree *DT, const TargetLibraryInfo *TLI,
900       const TargetTransformInfo *TTI, AssumptionCache *AC,
901       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
902       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
903       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
904       GeneratedRTChecks &Checks)
905       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
906                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
907                             Checks),
908         EPI(EPI) {}
909 
910   // Override this function to handle the more complex control flow around the
911   // three loops.
912   BasicBlock *createVectorizedLoopSkeleton() final override {
913     return createEpilogueVectorizedLoopSkeleton();
914   }
915 
916   /// The interface for creating a vectorized skeleton using one of two
917   /// different strategies, each corresponding to one execution of the vplan
918   /// as described above.
919   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
920 
921   /// Holds and updates state information required to vectorize the main loop
922   /// and its epilogue in two separate passes. This setup helps us avoid
923   /// regenerating and recomputing runtime safety checks. It also helps us to
924   /// shorten the iteration-count-check path length for the cases where the
925   /// iteration count of the loop is so small that the main vector loop is
926   /// completely skipped.
927   EpilogueLoopVectorizationInfo &EPI;
928 };
929 
930 /// A specialized derived class of inner loop vectorizer that performs
931 /// vectorization of *main* loops in the process of vectorizing loops and their
932 /// epilogues.
933 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
934 public:
935   EpilogueVectorizerMainLoop(
936       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
937       DominatorTree *DT, const TargetLibraryInfo *TLI,
938       const TargetTransformInfo *TTI, AssumptionCache *AC,
939       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
940       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
941       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
942       GeneratedRTChecks &Check)
943       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
944                                        EPI, LVL, CM, BFI, PSI, Check) {}
945   /// Implements the interface for creating a vectorized skeleton using the
946   /// *main loop* strategy (ie the first pass of vplan execution).
947   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
948 
949 protected:
950   /// Emits an iteration count bypass check once for the main loop (when \p
951   /// ForEpilogue is false) and once for the epilogue loop (when \p
952   /// ForEpilogue is true).
953   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
954                                              bool ForEpilogue);
955   void printDebugTracesAtStart() override;
956   void printDebugTracesAtEnd() override;
957 };
958 
959 // A specialized derived class of inner loop vectorizer that performs
960 // vectorization of *epilogue* loops in the process of vectorizing loops and
961 // their epilogues.
962 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
963 public:
964   EpilogueVectorizerEpilogueLoop(
965       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
966       DominatorTree *DT, const TargetLibraryInfo *TLI,
967       const TargetTransformInfo *TTI, AssumptionCache *AC,
968       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
969       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
970       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
971       GeneratedRTChecks &Checks)
972       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
973                                        EPI, LVL, CM, BFI, PSI, Checks) {}
974   /// Implements the interface for creating a vectorized skeleton using the
975   /// *epilogue loop* strategy (ie the second pass of vplan execution).
976   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
977 
978 protected:
979   /// Emits an iteration count bypass check after the main vector loop has
980   /// finished to see if there are any iterations left to execute by either
981   /// the vector epilogue or the scalar epilogue.
982   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
983                                                       BasicBlock *Bypass,
984                                                       BasicBlock *Insert);
985   void printDebugTracesAtStart() override;
986   void printDebugTracesAtEnd() override;
987 };
988 } // end namespace llvm
989 
990 /// Look for a meaningful debug location on the instruction or it's
991 /// operands.
992 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
993   if (!I)
994     return I;
995 
996   DebugLoc Empty;
997   if (I->getDebugLoc() != Empty)
998     return I;
999 
1000   for (Use &Op : I->operands()) {
1001     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1002       if (OpInst->getDebugLoc() != Empty)
1003         return OpInst;
1004   }
1005 
1006   return I;
1007 }
1008 
1009 void InnerLoopVectorizer::setDebugLocFromInst(
1010     const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
1011   IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
1012   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1013     const DILocation *DIL = Inst->getDebugLoc();
1014 
1015     // When a FSDiscriminator is enabled, we don't need to add the multiply
1016     // factors to the discriminators.
1017     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1018         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1019       // FIXME: For scalable vectors, assume vscale=1.
1020       auto NewDIL =
1021           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1022       if (NewDIL)
1023         B->SetCurrentDebugLocation(NewDIL.getValue());
1024       else
1025         LLVM_DEBUG(dbgs()
1026                    << "Failed to create new discriminator: "
1027                    << DIL->getFilename() << " Line: " << DIL->getLine());
1028     } else
1029       B->SetCurrentDebugLocation(DIL);
1030   } else
1031     B->SetCurrentDebugLocation(DebugLoc());
1032 }
1033 
1034 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1035 /// is passed, the message relates to that particular instruction.
1036 #ifndef NDEBUG
1037 static void debugVectorizationMessage(const StringRef Prefix,
1038                                       const StringRef DebugMsg,
1039                                       Instruction *I) {
1040   dbgs() << "LV: " << Prefix << DebugMsg;
1041   if (I != nullptr)
1042     dbgs() << " " << *I;
1043   else
1044     dbgs() << '.';
1045   dbgs() << '\n';
1046 }
1047 #endif
1048 
1049 /// Create an analysis remark that explains why vectorization failed
1050 ///
1051 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1052 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1053 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1054 /// the location of the remark.  \return the remark object that can be
1055 /// streamed to.
1056 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1057     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1058   Value *CodeRegion = TheLoop->getHeader();
1059   DebugLoc DL = TheLoop->getStartLoc();
1060 
1061   if (I) {
1062     CodeRegion = I->getParent();
1063     // If there is no debug location attached to the instruction, revert back to
1064     // using the loop's.
1065     if (I->getDebugLoc())
1066       DL = I->getDebugLoc();
1067   }
1068 
1069   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1070 }
1071 
1072 /// Return a value for Step multiplied by VF.
1073 static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
1074                               int64_t Step) {
1075   assert(Ty->isIntegerTy() && "Expected an integer step");
1076   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1077   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1078 }
1079 
1080 namespace llvm {
1081 
1082 /// Return the runtime value for VF.
1083 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1084   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1085   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1086 }
1087 
1088 static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
1089   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1090   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1091   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1092   return B.CreateUIToFP(RuntimeVF, FTy);
1093 }
1094 
1095 void reportVectorizationFailure(const StringRef DebugMsg,
1096                                 const StringRef OREMsg, const StringRef ORETag,
1097                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1098                                 Instruction *I) {
1099   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1100   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1101   ORE->emit(
1102       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1103       << "loop not vectorized: " << OREMsg);
1104 }
1105 
1106 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1107                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1108                              Instruction *I) {
1109   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1110   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1111   ORE->emit(
1112       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1113       << Msg);
1114 }
1115 
1116 } // end namespace llvm
1117 
1118 #ifndef NDEBUG
1119 /// \return string containing a file name and a line # for the given loop.
1120 static std::string getDebugLocString(const Loop *L) {
1121   std::string Result;
1122   if (L) {
1123     raw_string_ostream OS(Result);
1124     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1125       LoopDbgLoc.print(OS);
1126     else
1127       // Just print the module name.
1128       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1129     OS.flush();
1130   }
1131   return Result;
1132 }
1133 #endif
1134 
1135 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1136                                          const Instruction *Orig) {
1137   // If the loop was versioned with memchecks, add the corresponding no-alias
1138   // metadata.
1139   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1140     LVer->annotateInstWithNoAlias(To, Orig);
1141 }
1142 
void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  // Note: Visited is shared across all backward slices, so a recipe feeding
  // several address computations is analyzed only once.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe or
  // VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        // Only consecutive accesses whose underlying instruction is in a block
        // that needs predication start a backward-slice walk; non-consecutive
        // ones become gathers/scatters (see the pruning comment above).
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}
1220 
/// Attach metadata from \p From to \p To: run generic metadata propagation
/// (propagateMetadata) and then add the loop-versioning no-alias metadata
/// (addNewMetadata).
void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}
1226 
1227 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1228                                       Instruction *From) {
1229   for (Value *V : To) {
1230     if (Instruction *I = dyn_cast<Instruction>(V))
1231       addMetadata(I, From);
1232   }
1233 }
1234 
1235 namespace llvm {
1236 
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};
1259 
1260 /// ElementCountComparator creates a total ordering for ElementCount
1261 /// for the purposes of using it in a set structure.
1262 struct ElementCountComparator {
1263   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1264     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1265            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1266   }
1267 };
1268 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1269 
1270 /// LoopVectorizationCostModel - estimates the expected speedups due to
1271 /// vectorization.
1272 /// In many cases vectorization is not profitable. This can happen because of
1273 /// a number of reasons. In this class we mainly attempt to predict the
1274 /// expected speedup/slowdowns due to the supported instruction set. We use the
1275 /// TargetTransformInfo to query the different backends for the cost of
1276 /// different operations.
1277 class LoopVectorizationCostModel {
1278 public:
1279   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1280                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1281                              LoopVectorizationLegality *Legal,
1282                              const TargetTransformInfo &TTI,
1283                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1284                              AssumptionCache *AC,
1285                              OptimizationRemarkEmitter *ORE, const Function *F,
1286                              const LoopVectorizeHints *Hints,
1287                              InterleavedAccessInfo &IAI)
1288       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1289         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1290         Hints(Hints), InterleaveInfo(IAI) {}
1291 
1292   /// \return An upper bound for the vectorization factors (both fixed and
1293   /// scalable). If the factors are 0, vectorization and interleaving should be
1294   /// avoided up front.
1295   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1296 
1297   /// \return True if runtime checks are required for vectorization, and false
1298   /// otherwise.
1299   bool runtimeChecksRequired();
1300 
1301   /// \return The most profitable vectorization factor and the cost of that VF.
1302   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1303   /// then this vectorization factor will be selected if vectorization is
1304   /// possible.
1305   VectorizationFactor
1306   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1307 
  /// \return The vectorization factor to use for the epilogue loop, given the
  /// main loop's maximum VF \p MaxVF and the planner \p LVP.
  /// NOTE(review): semantics inferred from the signature and surrounding
  /// epilogue-vectorization code — confirm against the definition.
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);
1311 
  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    // Run the uniforms/scalars and scalarization collection for this VF,
    // then report whether its expected cost is a valid (computable) cost.
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }
1319 
  /// \return The size (in bits) of the smallest and widest types in the code
  /// that need to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);
1339 
  /// A struct that represents some properties of the register usage
  /// of a loop. Usage is tracked separately for each target-provided
  /// register class (keyed by ClassID).
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };
1350 
  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();
1365 
1366   /// Returns true if we should use strict in-order reductions for the given
1367   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1368   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1369   /// of FP operations.
1370   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1371     return !Hints->allowReordering() && RdxDesc.isOrdered();
1372   }
1373 
  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type. Read-only accessor for the MinBWs map populated elsewhere.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
1380 
1381   /// \returns True if it is more profitable to scalarize instruction \p I for
1382   /// vectorization factor \p VF.
1383   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1384     assert(VF.isVector() &&
1385            "Profitable to scalarize relevant only for VF > 1.");
1386 
1387     // Cost model is not run in the VPlan-native path - return conservative
1388     // result until this changes.
1389     if (EnableVPlanNativePath)
1390       return false;
1391 
1392     auto Scalars = InstsToScalarize.find(VF);
1393     assert(Scalars != InstsToScalarize.end() &&
1394            "VF not yet analyzed for scalarization profitability");
1395     return Scalars->second.find(I) != Scalars->second.end();
1396   }
1397 
1398   /// Returns true if \p I is known to be uniform after vectorization.
1399   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1400     if (VF.isScalar())
1401       return true;
1402 
1403     // Cost model is not run in the VPlan-native path - return conservative
1404     // result until this changes.
1405     if (EnableVPlanNativePath)
1406       return false;
1407 
1408     auto UniformsPerVF = Uniforms.find(VF);
1409     assert(UniformsPerVF != Uniforms.end() &&
1410            "VF not yet analyzed for uniformity");
1411     return UniformsPerVF->second.count(I);
1412   }
1413 
1414   /// Returns true if \p I is known to be scalar after vectorization.
1415   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1416     if (VF.isScalar())
1417       return true;
1418 
1419     // Cost model is not run in the VPlan-native path - return conservative
1420     // result until this changes.
1421     if (EnableVPlanNativePath)
1422       return false;
1423 
1424     auto ScalarsPerVF = Scalars.find(VF);
1425     assert(ScalarsPerVF != Scalars.end() &&
1426            "Scalar values are not calculated for VF");
1427     return ScalarsPerVF->second.count(I);
1428   }
1429 
1430   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1431   /// for vectorization factor \p VF.
1432   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1433     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1434            !isProfitableToScalarize(I, VF) &&
1435            !isScalarAfterVectorization(I, VF);
1436   }
1437 
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,       // No decision was recorded for the instruction.
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Member of an interleave group (wide load/store +
                      // shuffles).
    CM_GatherScatter, // Use a masked gather or scatter.
    CM_Scalarize      // Emit scalar (possibly predicated) accesses.
  };
1447 
1448   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1449   /// instruction \p I and vector width \p VF.
1450   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1451                            InstructionCost Cost) {
1452     assert(VF.isVector() && "Expected VF >=2");
1453     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1454   }
1455 
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only: the insert
    // position carries the full cost, every other member records cost 0 so
    // the group is not double-counted.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }
1473 
1474   /// Return the cost model decision for the given instruction \p I and vector
1475   /// width \p VF. Return CM_Unknown if this instruction did not pass
1476   /// through the cost modeling.
1477   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1478     assert(VF.isVector() && "Expected VF to be a vector VF");
1479     // Cost model is not run in the VPlan-native path - return conservative
1480     // result until this changes.
1481     if (EnableVPlanNativePath)
1482       return CM_GatherScatter;
1483 
1484     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1485     auto Itr = WideningDecisions.find(InstOnVF);
1486     if (Itr == WideningDecisions.end())
1487       return CM_Unknown;
1488     return Itr->second.first;
1489   }
1490 
1491   /// Return the vectorization cost for the given instruction \p I and vector
1492   /// width \p VF.
1493   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1494     assert(VF.isVector() && "Expected VF >=2");
1495     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1496     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1497            "The cost is not calculated");
1498     return WideningDecisions[InstOnVF].second;
1499   }
1500 
1501   /// Return True if instruction \p I is an optimizable truncate whose operand
1502   /// is an induction variable. Such a truncate will be removed by adding a new
1503   /// induction variable with the destination type.
1504   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1505     // If the instruction is not a truncate, return false.
1506     auto *Trunc = dyn_cast<TruncInst>(I);
1507     if (!Trunc)
1508       return false;
1509 
1510     // Get the source and destination types of the truncate.
1511     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1512     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1513 
1514     // If the truncate is free for the given types, return false. Replacing a
1515     // free truncate with an induction variable would add an induction variable
1516     // update instruction to each iteration of the loop. We exclude from this
1517     // check the primary induction variable since it will need an update
1518     // instruction regardless.
1519     Value *Op = Trunc->getOperand(0);
1520     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1521       return false;
1522 
1523     // If the truncated value is not an induction variable, return false.
1524     return Legal->isInductionPhi(Op);
1525   }
1526 
  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop. Results are cached per-VF in the InstsToScalarize map.
  void collectInstsToScalarize(ElementCount VF);
1530 
1531   /// Collect Uniform and Scalar values for the given \p VF.
1532   /// The sets depend on CM decision for Load/Store instructions
1533   /// that may be vectorized as interleave, gather-scatter or scalarized.
1534   void collectUniformsAndScalars(ElementCount VF) {
1535     // Do the analysis once.
1536     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1537       return;
1538     setCostBasedWideningDecision(VF);
1539     collectLoopUniforms(VF);
1540     collectLoopScalars(VF);
1541   }
1542 
1543   /// Returns true if the target machine supports masked store operation
1544   /// for the given \p DataType and kind of access to \p Ptr.
1545   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1546     return Legal->isConsecutivePtr(DataType, Ptr) &&
1547            TTI.isLegalMaskedStore(DataType, Alignment);
1548   }
1549 
1550   /// Returns true if the target machine supports masked load operation
1551   /// for the given \p DataType and kind of access to \p Ptr.
1552   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1553     return Legal->isConsecutivePtr(DataType, Ptr) &&
1554            TTI.isLegalMaskedLoad(DataType, Alignment);
1555   }
1556 
1557   /// Returns true if the target machine can represent \p V as a masked gather
1558   /// or scatter operation.
1559   bool isLegalGatherOrScatter(Value *V) {
1560     bool LI = isa<LoadInst>(V);
1561     bool SI = isa<StoreInst>(V);
1562     if (!LI && !SI)
1563       return false;
1564     auto *Ty = getLoadStoreType(V);
1565     Align Align = getLoadStoreAlignment(V);
1566     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1567            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1568   }
1569 
1570   /// Returns true if the target machine supports all of the reduction
1571   /// variables found for the given VF.
1572   bool canVectorizeReductions(ElementCount VF) const {
1573     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1574       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1575       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1576     }));
1577   }
1578 
  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I) const;
1585 
1586   // Returns true if \p I is an instruction that will be predicated either
1587   // through scalar predication or masked load/store or masked gather/scatter.
1588   // Superset of instructions that return true for isScalarWithPredication.
1589   bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
1590     // When we know the load is uniform and the original scalar loop was not
1591     // predicated we don't need to mark it as a predicated instruction. Any
1592     // vectorised blocks created when tail-folding are something artificial we
1593     // have introduced and we know there is always at least one active lane.
1594     // That's why we call Legal->blockNeedsPredication here because it doesn't
1595     // query tail-folding.
1596     if (IsKnownUniform && isa<LoadInst>(I) &&
1597         !Legal->blockNeedsPredication(I->getParent()))
1598       return false;
1599     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1600       return false;
1601     // Loads and stores that need some form of masked operation are predicated
1602     // instructions.
1603     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1604       return Legal->isMaskRequired(I);
1605     return isScalarWithPredication(I);
1606   }
1607 
  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  /// Thin wrapper around InterleaveInfo.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  /// Thin wrapper around InterleaveInfo.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
1631 
1632   /// Returns true if we're required to use a scalar epilogue for at least
1633   /// the final iteration of the original loop.
1634   bool requiresScalarEpilogue(ElementCount VF) const {
1635     if (!isScalarEpilogueAllowed())
1636       return false;
1637     // If we might exit from anywhere but the latch, must run the exiting
1638     // iteration in scalar form.
1639     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1640       return true;
1641     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1642   }
1643 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// forbidden due to optsize or a loop hint annotation. (The previous
  /// comment stated the inverse; the code returns true when the status is
  /// CM_ScalarEpilogueAllowed.)
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if the instructions in this block requires predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }
1659 
  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  /// Populated by collectInLoopReductions().
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }
1675 
  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF.  Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    // Drop all per-VF analyses so they will be recomputed on next query.
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }
1700 
private:
  /// Number of predicated stores discovered in the loop.
  /// NOTE(review): updated outside this chunk; confirm semantics at the
  /// update sites.
  unsigned NumPredStores = 0;
1703 
  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;
1737 
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>; // (I, VF)
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);
1757 
  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);
1798 
  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1853 
  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform nodes
  /// to the list if they are used by a load/store instruction that is marked as
  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
  /// VF values in the vectorized loop, each corresponding to an iteration of
  /// the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  /// Per-(instruction, VF) widening decisions and their costs.
  DecisionList WideningDecisions;
1888 
1889   /// Returns true if \p V is expected to be vectorized and it needs to be
1890   /// extracted.
1891   bool needsExtract(Value *V, ElementCount VF) const {
1892     Instruction *I = dyn_cast<Instruction>(V);
1893     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1894         TheLoop->isLoopInvariant(I))
1895       return false;
1896 
1897     // Assume we can vectorize V (and hence we need extraction) if the
1898     // scalars are not computed yet. This can happen, because it is called
1899     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1900     // the scalars are collected. That should be a safe assumption in most
1901     // cases, because we check if the operands have vectorizable types
1902     // beforehand in LoopVectorizationLegality.
1903     return Scalars.find(VF) == Scalars.end() ||
1904            !isScalarAfterVectorization(I, VF);
1905   };
1906 
1907   /// Returns a range containing only operands needing to be extracted.
1908   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1909                                                    ElementCount VF) const {
1910     return SmallVector<Value *, 4>(make_filter_range(
1911         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1912   }
1913 
  /// Determines if we have the infrastructure to vectorize loop \p L and its
  /// epilogue, assuming the main loop is vectorized by \p VF.
  bool isCandidateForEpilogueVectorization(const Loop &L,
                                           const ElementCount VF) const;

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1923 
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The function that contains the loop being vectorized.
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
1972 };
1973 } // end namespace llvm
1974 
/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  /// Analyses kept up to date as check blocks are created and removed.
  DominatorTree *DT;
  LoopInfo *LI;

  /// Expander used for the SCEV predicate checks.
  SCEVExpander SCEVExp;
  /// Expander for the memory runtime checks.
  /// NOTE(review): presumably used when expanding LAI's runtime pointer
  /// checks; the use site is outside this chunk — confirm.
  SCEVExpander MemCheckExp;

public:
  // NOTE(review): both expanders are constructed with the "scev.check" name
  // prefix; confirm MemCheckExp was not meant to use a distinct prefix such
  // as "mem.check".
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}
2008 
2009   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
2010   /// accurately estimate the cost of the runtime checks. The blocks are
2011   /// un-linked from the IR and is added back during vector code generation. If
2012   /// there is no vector code generation, the check blocks are removed
2013   /// completely.
2014   void Create(Loop *L, const LoopAccessInfo &LAI,
2015               const SCEVUnionPredicate &UnionPred) {
2016 
2017     BasicBlock *LoopHeader = L->getHeader();
2018     BasicBlock *Preheader = L->getLoopPreheader();
2019 
2020     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2021     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2022     // may be used by SCEVExpander. The blocks will be un-linked from their
2023     // predecessors and removed from LI & DT at the end of the function.
2024     if (!UnionPred.isAlwaysTrue()) {
2025       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2026                                   nullptr, "vector.scevcheck");
2027 
2028       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2029           &UnionPred, SCEVCheckBlock->getTerminator());
2030     }
2031 
2032     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2033     if (RtPtrChecking.Need) {
2034       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2035       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2036                                  "vector.memcheck");
2037 
2038       MemRuntimeCheckCond =
2039           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2040                            RtPtrChecking.getChecks(), MemCheckExp);
2041       assert(MemRuntimeCheckCond &&
2042              "no RT checks generated although RtPtrChecking "
2043              "claimed checks are required");
2044     }
2045 
2046     if (!MemCheckBlock && !SCEVCheckBlock)
2047       return;
2048 
2049     // Unhook the temporary block with the checks, update various places
2050     // accordingly.
2051     if (SCEVCheckBlock)
2052       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2053     if (MemCheckBlock)
2054       MemCheckBlock->replaceAllUsesWith(Preheader);
2055 
2056     if (SCEVCheckBlock) {
2057       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2058       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2059       Preheader->getTerminator()->eraseFromParent();
2060     }
2061     if (MemCheckBlock) {
2062       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2063       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2064       Preheader->getTerminator()->eraseFromParent();
2065     }
2066 
2067     DT->changeImmediateDominator(LoopHeader, Preheader);
2068     if (MemCheckBlock) {
2069       DT->eraseNode(MemCheckBlock);
2070       LI->removeBlock(MemCheckBlock);
2071     }
2072     if (SCEVCheckBlock) {
2073       DT->eraseNode(SCEVCheckBlock);
2074       LI->removeBlock(SCEVCheckBlock);
2075     }
2076   }
2077 
2078   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2079   /// unused.
2080   ~GeneratedRTChecks() {
2081     SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2082     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2083     if (!SCEVCheckCond)
2084       SCEVCleaner.markResultUsed();
2085 
2086     if (!MemRuntimeCheckCond)
2087       MemCheckCleaner.markResultUsed();
2088 
2089     if (MemRuntimeCheckCond) {
2090       auto &SE = *MemCheckExp.getSE();
2091       // Memory runtime check generation creates compares that use expanded
2092       // values. Remove them before running the SCEVExpanderCleaners.
2093       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2094         if (MemCheckExp.isInsertedInstruction(&I))
2095           continue;
2096         SE.forgetValue(&I);
2097         I.eraseFromParent();
2098       }
2099     }
2100     MemCheckCleaner.cleanup();
2101     SCEVCleaner.cleanup();
2102 
2103     if (SCEVCheckCond)
2104       SCEVCheckBlock->eraseFromParent();
2105     if (MemRuntimeCheckCond)
2106       MemCheckBlock->eraseFromParent();
2107   }
2108 
2109   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2110   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2111   /// depending on the generated condition.
2112   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2113                              BasicBlock *LoopVectorPreHeader,
2114                              BasicBlock *LoopExitBlock) {
2115     if (!SCEVCheckCond)
2116       return nullptr;
2117     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2118       if (C->isZero())
2119         return nullptr;
2120 
2121     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2122 
2123     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2124     // Create new preheader for vector loop.
2125     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2126       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2127 
2128     SCEVCheckBlock->getTerminator()->eraseFromParent();
2129     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2130     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2131                                                 SCEVCheckBlock);
2132 
2133     DT->addNewBlock(SCEVCheckBlock, Pred);
2134     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2135 
2136     ReplaceInstWithInst(
2137         SCEVCheckBlock->getTerminator(),
2138         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2139     // Mark the check as used, to prevent it from being removed during cleanup.
2140     SCEVCheckCond = nullptr;
2141     return SCEVCheckBlock;
2142   }
2143 
2144   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2145   /// the branches to branch to the vector preheader or \p Bypass, depending on
2146   /// the generated condition.
2147   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2148                                    BasicBlock *LoopVectorPreHeader) {
2149     // Check if we generated code that checks in runtime if arrays overlap.
2150     if (!MemRuntimeCheckCond)
2151       return nullptr;
2152 
2153     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2154     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2155                                                 MemCheckBlock);
2156 
2157     DT->addNewBlock(MemCheckBlock, Pred);
2158     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2159     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2160 
2161     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2162       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2163 
2164     ReplaceInstWithInst(
2165         MemCheckBlock->getTerminator(),
2166         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2167     MemCheckBlock->getTerminator()->setDebugLoc(
2168         Pred->getTerminator()->getDebugLoc());
2169 
2170     // Mark the check as used, to prevent it from being removed during cleanup.
2171     MemRuntimeCheckCond = nullptr;
2172     return MemCheckBlock;
2173   }
2174 };
2175 
2176 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2177 // vectorization. The loop needs to be annotated with #pragma omp simd
2178 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2179 // vector length information is not provided, vectorization is not considered
2180 // explicit. Interleave hints are not allowed either. These limitations will be
2181 // relaxed in the future.
2182 // Please, note that we are currently forced to abuse the pragma 'clang
2183 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2184 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2185 // provides *explicit vectorization hints* (LV can bypass legal checks and
2186 // assume that vectorization is legal). However, both hints are implemented
2187 // using the same metadata (llvm.loop.vectorize, processed by
2188 // LoopVectorizeHints). This will be fixed in the future when the native IR
2189 // representation for pragma 'omp simd' is introduced.
2190 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2191                                    OptimizationRemarkEmitter *ORE) {
2192   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2193   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2194 
2195   // Only outer loops with an explicit vectorization hint are supported.
2196   // Unannotated outer loops are ignored.
2197   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2198     return false;
2199 
2200   Function *Fn = OuterLp->getHeader()->getParent();
2201   if (!Hints.allowVectorization(Fn, OuterLp,
2202                                 true /*VectorizeOnlyWhenForced*/)) {
2203     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2204     return false;
2205   }
2206 
2207   if (Hints.getInterleave() > 1) {
2208     // TODO: Interleave support is future work.
2209     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2210                          "outer loops.\n");
2211     Hints.emitRemarkWithHints();
2212     return false;
2213   }
2214 
2215   return true;
2216 }
2217 
2218 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2219                                   OptimizationRemarkEmitter *ORE,
2220                                   SmallVectorImpl<Loop *> &V) {
2221   // Collect inner loops and outer loops without irreducible control flow. For
2222   // now, only collect outer loops that have explicit vectorization hints. If we
2223   // are stress testing the VPlan H-CFG construction, we collect the outermost
2224   // loop of every loop nest.
2225   if (L.isInnermost() || VPlanBuildStressTest ||
2226       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2227     LoopBlocksRPO RPOT(&L);
2228     RPOT.perform(LI);
2229     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2230       V.push_back(&L);
2231       // TODO: Collect inner loops inside marked outer loops in case
2232       // vectorization fails for the outer loop. Do not invoke
2233       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2234       // already known to be reducible. We can use an inherited attribute for
2235       // that.
2236       return;
2237     }
2238   }
2239   for (Loop *InnerL : L)
2240     collectSupportedLoops(*InnerL, LI, ORE, V);
2241 }
2242 
2243 namespace {
2244 
/// The LoopVectorize Pass.
///
/// Legacy pass-manager wrapper: it gathers the analyses the vectorizer needs
/// and forwards the work to LoopVectorizePass::runImpl.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  /// The actual implementation, shared with the new pass manager.
  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    // Collect all analyses required by the implementation.
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    // TargetLibraryInfo is optional; pass nullptr if unavailable.
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    // LoopAccessInfo is computed lazily, per loop, through this callback.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};
2311 
2312 } // end anonymous namespace
2313 
2314 //===----------------------------------------------------------------------===//
2315 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2316 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2317 //===----------------------------------------------------------------------===//
2318 
2319 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2320   // We need to place the broadcast of invariant variables outside the loop,
2321   // but only if it's proven safe to do so. Else, broadcast will be inside
2322   // vector loop body.
2323   Instruction *Instr = dyn_cast<Instruction>(V);
2324   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2325                      (!Instr ||
2326                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2327   // Place the code for broadcasting invariant variables in the new preheader.
2328   IRBuilder<>::InsertPointGuard Guard(Builder);
2329   if (SafeToHoist)
2330     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2331 
2332   // Broadcast the scalar into all locations in the vector.
2333   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2334 
2335   return Shuf;
2336 }
2337 
// Create a widened (vector) PHI for an integer or floating-point induction,
// together with the per-part step updates in the vector loop body.
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Value *Start,
    Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
  IRBuilder<> &Builder = State.Builder;
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    // When widening a truncate of the induction, narrow both the step and the
    // start value to the truncated type up front.
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  // Initial vector value: <Start, Start+Step, ..., Start+(VF-1)*Step>,
  // built by getStepVector from the splatted start (with a zero start index).
  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    // Part 0 is the PHI itself; each subsequent part adds VF * Step.
    State.set(Def, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  // Wire up the PHI: stepped start from the preheader, last update from the
  // latch.
  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
2421 
2422 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2423   return Cost->isScalarAfterVectorization(I, VF) ||
2424          Cost->isProfitableToScalarize(I, VF);
2425 }
2426 
2427 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2428   if (shouldScalarizeInstruction(IV))
2429     return true;
2430   auto isScalarInst = [&](User *U) -> bool {
2431     auto *I = cast<Instruction>(U);
2432     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2433   };
2434   return llvm::any_of(IV->users(), isScalarInst);
2435 }
2436 
// Widen an integer or floating-point induction variable \p IV, creating the
// vector and/or scalar forms its users need. \p Trunc, if non-null, is a
// truncate of the induction to widen instead of the full-width IV.
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
                                                const InductionDescriptor &ID,
                                                Value *Start, TruncInst *Trunc,
                                                VPValue *Def,
                                                VPTransformState &State) {
  IRBuilder<> &Builder = State.Builder;
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = EntryVal->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
           "Induction step should be loop invariant");
    if (PSE.getSE()->isSCEVable(IV->getType())) {
      // Expand the step in the vector preheader so it is available to all
      // parts.
      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               State.CFG.VectorPreHeader->getTerminator());
    }
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // The scalar value to broadcast. This is derived from the canonical
  // induction variable. If a truncation type is given, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = Induction;
    if (IV != OldInduction) {
      // Derive this induction from the canonical one: convert to the IV's
      // type, then apply the induction's start/step transform.
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
                                      State.CFG.PrevBB);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      // Note: Step is an in-out parameter — the truncated step is used by the
      // callers of this lambda as well.
      Step = Builder.CreateTrunc(Step, TruncType);
    }
    return ScalarIV;
  };

  // Create the vector values from the scalar IV, in the absence of creating a
  // vector IV.
  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
      Value *StartIdx;
      if (Step->getType()->isFloatingPointTy())
        StartIdx =
            getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
      else
        StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);

      Value *EntryPart =
          getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
      State.set(Def, EntryPart, Part);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
    }
  };

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with creating the step value.
  Value *Step = CreateStepValue(ID.getStep());
  if (State.VF.isZero() || State.VF.isScalar()) {
    // Not vectorizing: just produce the (splatted) scalar IV.
    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);
    return;
  }

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = needsScalarInduction(EntryVal);
  if (!NeedsScalarIV) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
    return;
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (!shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
    Value *ScalarIV = CreateScalarIV(Step);
    // Create scalar steps that can be used by instructions we will later
    // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
    // InstCombine. We will be trading one vector extract for each scalar step.
    buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
    return;
  }

  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
  // predicate used by the masked loads/stores.
  Value *ScalarIV = CreateScalarIV(Step);
  if (!Cost->isScalarEpilogueAllowed())
    CreateSplatIV(ScalarIV, Step);
  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
}
2556 
2557 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
2558                                           Value *Step,
2559                                           Instruction::BinaryOps BinOp) {
2560   // Create and check the types.
2561   auto *ValVTy = cast<VectorType>(Val->getType());
2562   ElementCount VLen = ValVTy->getElementCount();
2563 
2564   Type *STy = Val->getType()->getScalarType();
2565   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2566          "Induction Step must be an integer or FP");
2567   assert(Step->getType() == STy && "Step has wrong type");
2568 
2569   SmallVector<Constant *, 8> Indices;
2570 
2571   // Create a vector of consecutive numbers from zero to VF.
2572   VectorType *InitVecValVTy = ValVTy;
2573   Type *InitVecValSTy = STy;
2574   if (STy->isFloatingPointTy()) {
2575     InitVecValSTy =
2576         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2577     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2578   }
2579   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2580 
2581   // Splat the StartIdx
2582   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2583 
2584   if (STy->isIntegerTy()) {
2585     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2586     Step = Builder.CreateVectorSplat(VLen, Step);
2587     assert(Step->getType() == Val->getType() && "Invalid step vec");
2588     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2589     // which can be found from the original scalar operations.
2590     Step = Builder.CreateMul(InitVec, Step);
2591     return Builder.CreateAdd(Val, Step, "induction");
2592   }
2593 
2594   // Floating point induction.
2595   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2596          "Binary Opcode should be specified for FP induction");
2597   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2598   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2599 
2600   Step = Builder.CreateVectorSplat(VLen, Step);
2601   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2602   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2603 }
2604 
// Compute per-lane scalar values of the induction (ScalarIV + lane * Step)
// for every unrolled part, recording them in State. For scalable VFs a vector
// form is recorded as well.
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID,
                                           VPValue *Def,
                                           VPTransformState &State) {
  IRBuilder<> &Builder = State.Builder;
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  bool IsUniform =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
  unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
                                     ScalarIVTy->getScalarSizeInBits());
  // For non-uniform scalable VFs, per-lane values cannot be enumerated at
  // compile time, so a vector form is built instead; these splats are shared
  // by all parts.
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!IsUniform && State.VF.isScalable()) {
    VecIVTy = VectorType::get(ScalarIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
  }

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // Starting lane index of this part: Part * VF (runtime value when VF is
    // scalable).
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!IsUniform && State.VF.isScalable()) {
      // Vector form: SplatIV + (StartIdx0 + <0,1,...>) * SplatStep.
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (ScalarIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(Def, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (ScalarIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);

    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      // Per-lane value: ScalarIV + (StartIdx0 + Lane) * Step.
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
      State.set(Def, Add, VPIteration(Part, Lane));
    }
  }
}
2682 
2683 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2684                                                     const VPIteration &Instance,
2685                                                     VPTransformState &State) {
2686   Value *ScalarInst = State.get(Def, Instance);
2687   Value *VectorValue = State.get(Def, Instance.Part);
2688   VectorValue = Builder.CreateInsertElement(
2689       VectorValue, ScalarInst,
2690       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2691   State.set(Def, VectorValue, Instance.Part);
2692 }
2693 
2694 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2695   assert(Vec->getType()->isVectorTy() && "Invalid type");
2696   return Builder.CreateVectorReverse(Vec, "reverse");
2697 }
2698 
2699 // Return whether we allow using masked interleave-groups (for dealing with
2700 // strided loads/stores that reside in predicated blocks, or for dealing
2701 // with gaps).
2702 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2703   // If an override option has been passed in for interleaved accesses, use it.
2704   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2705     return EnableMaskedInterleavedMemAccesses;
2706 
2707   return TTI.enableMaskedInterleavedAccessVectorization();
2708 }
2709 
2710 // Try to vectorize the interleave group that \p Instr belongs to.
2711 //
2712 // E.g. Translate following interleaved load group (factor = 3):
2713 //   for (i = 0; i < N; i+=3) {
2714 //     R = Pic[i];             // Member of index 0
2715 //     G = Pic[i+1];           // Member of index 1
2716 //     B = Pic[i+2];           // Member of index 2
2717 //     ... // do something to R, G, B
2718 //   }
2719 // To:
2720 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2721 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2722 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2723 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2724 //
2725 // Or translate following interleaved store group (factor = 3):
2726 //   for (i = 0; i < N; i+=3) {
2727 //     ... do something to R, G, B
2728 //     Pic[i]   = R;           // Member of index 0
2729 //     Pic[i+1] = G;           // Member of index 1
2730 //     Pic[i+2] = B;           // Member of index 2
2731 //   }
2732 // To:
2733 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2734 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2735 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2736 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2737 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // The wide vector covers VF tuples of InterleaveFactor members each.
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    setDebugLocFromInst(AddrPart);

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    // Carry over the inbounds flag of the original address computation, if
    // there was a GEP behind the (possibly casted) pointer.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    // Step back by Index elements so the pointer refers to member 0.
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Instr);
  Value *PoisonVec = PoisonValue::get(VecTy);

  // For loads, a gap mask is only needed when the group has gaps that a
  // scalar epilogue is not allowed to handle.
  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          // Replicate the block mask across the members of each tuple, then
          // AND in the gap mask when one is required.
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart,
              createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
              "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
                                     GroupMask, PoisonVec, "wide.masked.vec");
      }
      else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  // Recompute the gap mask for the store path; it may be null when the group
  // has no gaps (see the asserts below).
  MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
         "masked interleaved groups are not allowed.");
  assert((!MaskForGaps || !VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group");
      Instruction *Member = Group->getMember(i);

      // Skip the gaps in the group.
      if (!Member) {
        // NOTE: despite the local's name, the gap placeholder is poison; the
        // assert above guarantees a gap mask exists to cover these lanes.
        Value *Undef = PoisonValue::get(SubVT);
        StoredVecs.push_back(Undef);
        continue;
      }

      Value *StoredVec = State.get(StoredValues[i], Part);

      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Value *IVec = Builder.CreateShuffleVector(
        WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
        "interleaved.vec");

    Instruction *NewStoreInstr;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = MaskForGaps;
      if (BlockInMask) {
        // As in the load path: replicate the block mask per member and AND in
        // the gap mask when present.
        Value *BlockInMaskPart = State.get(BlockInMask, Part);
        Value *ShuffledMask = Builder.CreateShuffleVector(
            BlockInMaskPart,
            createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
            "interleaved.mask");
        GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
                                                      ShuffledMask, MaskForGaps)
                                : ShuffledMask;
      }
      NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
                                                Group->getAlign(), GroupMask);
    } else
      NewStoreInstr =
          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}
2937 
// Create a scalar clone of \p Instr for the single (part, lane) iteration
// \p Instance, wire its operands to the scalar values recorded in \p State,
// and register the clone as the value of \p RepRecipe for that iteration.
// Clones of predicated instructions are collected in PredicatedInstructions.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               VPReplicateRecipe *RepRecipe,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
  if (isa<NoAliasScopeDeclInst>(Instr))
    if (!Instance.isFirstIteration())
      return;

  setDebugLocFromInst(Instr);

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // If the scalarized instruction contributes to the address computation of a
  // widen masked load/store which was in a basic block that needed predication
  // and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widen
  // load/store.
  if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0)
    Cloned->dropPoisonGeneratingFlags();

  // Keep the transform-state builder in sync with this vectorizer's builder,
  // so values materialized through State land at the same insertion point.
  State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
                               Builder.GetInsertPoint());
  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (auto &I : enumerate(RepRecipe->operands())) {
    auto InputInstance = Instance;
    VPValue *Operand = I.value();
    // Uniform operands only provide a value for the first lane; read that one
    // regardless of which lane we are cloning for.
    if (State.Plan->isUniformAfterVectorization(Operand))
      InputInstance.Lane = VPLane::getFirstLane();
    Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  State.set(RepRecipe, Cloned, Instance);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))
    AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}
2995 
// Create the primary induction variable for loop \p L: a PHI named "index"
// that starts at \p Start, is advanced by \p Step in the latch, and exits
// the loop once the incremented value equals \p End. The newly built
// compare-and-branch replaces the latch's previous terminator.
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> B(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(OldInst, &B);
  auto *Induction = B.CreatePHI(Start->getType(), 2, "index");

  B.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(OldInst, &B);

  // Create i+1 and fill the PHINode.
  //
  // If the tail is not folded, we know that End - Start >= Step (either
  // statically or through the minimum iteration checks). We also know that both
  // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
  // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
  // overflows and we can mark the induction increment as NUW.
  Value *Next = B.CreateAdd(Induction, Step, "index.next",
                            /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = B.CreateICmpEQ(Next, End);
  B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}
3034 
// Return the trip count of loop \p L, materializing it in the preheader on
// first use and caching it in the TripCount member. The count is computed
// from SCEV's backedge-taken count as BTC + 1, adjusted to the widest
// induction type.
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  // Return the cached value if it has already been created.
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  // If SCEV produced a pointer-typed count, convert it to the integer
  // induction type before handing it to arithmetic users.
  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}
3081 
// Return the number of iterations the vectorized body of loop \p L will
// execute (cached in VectorTripCount): the trip count rounded down to a
// multiple of VF * UF; rounded up first when the tail is folded by masking,
// and reduced by one full step when a scalar epilogue is required and the
// step evenly divides the trip count.
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  // Return the cached value if it has already been created.
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(Builder, Ty, VF, UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    assert(!VF.isScalable() &&
           "Tail folding not yet supported for scalable vectors");
    TC = Builder.CreateAdd(
        TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the remainder
  // loop.  See the cost model for when this can happen.  If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(VF)) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}
3130 
3131 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3132                                                    const DataLayout &DL) {
3133   // Verify that V is a vector type with same number of elements as DstVTy.
3134   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3135   unsigned VF = DstFVTy->getNumElements();
3136   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3137   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3138   Type *SrcElemTy = SrcVecTy->getElementType();
3139   Type *DstElemTy = DstFVTy->getElementType();
3140   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3141          "Vector elements must have same size");
3142 
3143   // Do a direct cast if element types are castable.
3144   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3145     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3146   }
3147   // V cannot be directly casted to desired vector type.
3148   // May happen when V is a floating point vector but DstVTy is a vector of
3149   // pointers or vice-versa. Handle this using a two-step bitcast using an
3150   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3151   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3152          "Only one type should be a pointer type");
3153   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3154          "Only one type should be a floating point type");
3155   Type *IntTy =
3156       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3157   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3158   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3159   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3160 }
3161 
// Emit a check in the current vector preheader that branches to \p Bypass
// (the scalar path) when the trip count is too small for even one vector
// iteration, then split off a fresh "vector.ph" block as the new vector
// preheader. The dominator tree is updated accordingly.
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
                                            : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking()) {
    Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
  }
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  // Update dominator for Bypass & LoopExit (if needed).
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks  and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  // Replace the unconditional terminator with the conditional bypass branch.
  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}
3206 
// Emit the SCEV-predicate runtime checks (e.g. stride/overflow assumptions),
// branching to \p Bypass when a check fails. Returns the emitted check block,
// or nullptr when no checks were required.
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {

  BasicBlock *const SCEVCheckBlock =
      RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
  if (!SCEVCheckBlock)
    return nullptr;

  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");


  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    if (!Cost->requiresScalarEpilogue(VF))
      // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks  and thus no need to update the immediate
      // dominator of the exit blocks.
      DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
  return SCEVCheckBlock;
}
3234 
// Emit runtime memory checks that verify the accessed arrays do not overlap,
// branching to \p Bypass on conflict. Returns the check block, or nullptr if
// no checks are needed. Also sets up LoopVersioning, which is used here only
// to attach noalias metadata later.
BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
                                                      BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return nullptr;

  BasicBlock *const MemCheckBlock =
      RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);

  // Check if we generated code that checks in runtime if arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  if (!MemCheckBlock)
    return nullptr;

  // Emitting checks while optimizing for size is only acceptable when the
  // user explicitly forced vectorization; warn about the code-size cost.
  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        L->getStartLoc(), L->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  LoopBypassBlocks.push_back(MemCheckBlock);

  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(
      *Legal->getLAI(),
      Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
      DT, PSE.getSE());
  LVer->prepareNoAliasMetadata();
  return MemCheckBlock;
}
3277 
3278 Value *InnerLoopVectorizer::emitTransformedIndex(
3279     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3280     const InductionDescriptor &ID, BasicBlock *VectorHeader) const {
3281 
3282   SCEVExpander Exp(*SE, DL, "induction");
3283   auto Step = ID.getStep();
3284   auto StartValue = ID.getStartValue();
3285   assert(Index->getType()->getScalarType() == Step->getType() &&
3286          "Index scalar type does not match StepValue type");
3287 
3288   // Note: the IR at this point is broken. We cannot use SE to create any new
3289   // SCEV and then expand it, hoping that SCEV's simplification will give us
3290   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
3291   // lead to various SCEV crashes. So all we can do is to use builder and rely
3292   // on InstCombine for future simplifications. Here we handle some trivial
3293   // cases only.
3294   auto CreateAdd = [&B](Value *X, Value *Y) {
3295     assert(X->getType() == Y->getType() && "Types don't match!");
3296     if (auto *CX = dyn_cast<ConstantInt>(X))
3297       if (CX->isZero())
3298         return Y;
3299     if (auto *CY = dyn_cast<ConstantInt>(Y))
3300       if (CY->isZero())
3301         return X;
3302     return B.CreateAdd(X, Y);
3303   };
3304 
3305   // We allow X to be a vector type, in which case Y will potentially be
3306   // splatted into a vector with the same element count.
3307   auto CreateMul = [&B](Value *X, Value *Y) {
3308     assert(X->getType()->getScalarType() == Y->getType() &&
3309            "Types don't match!");
3310     if (auto *CX = dyn_cast<ConstantInt>(X))
3311       if (CX->isOne())
3312         return Y;
3313     if (auto *CY = dyn_cast<ConstantInt>(Y))
3314       if (CY->isOne())
3315         return X;
3316     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3317     if (XVTy && !isa<VectorType>(Y->getType()))
3318       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3319     return B.CreateMul(X, Y);
3320   };
3321 
3322   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3323   // loop, choose the end of the vector loop header (=VectorHeader), because
3324   // the DomTree is not kept up-to-date for additional blocks generated in the
3325   // vector loop. By using the header as insertion point, we guarantee that the
3326   // expanded instructions dominate all their uses.
3327   auto GetInsertPoint = [this, &B, VectorHeader]() {
3328     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3329     if (InsertBB != LoopVectorBody &&
3330         LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
3331       return VectorHeader->getTerminator();
3332     return &*B.GetInsertPoint();
3333   };
3334 
3335   switch (ID.getKind()) {
3336   case InductionDescriptor::IK_IntInduction: {
3337     assert(!isa<VectorType>(Index->getType()) &&
3338            "Vector indices not supported for integer inductions yet");
3339     assert(Index->getType() == StartValue->getType() &&
3340            "Index type does not match StartValue type");
3341     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3342       return B.CreateSub(StartValue, Index);
3343     auto *Offset = CreateMul(
3344         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3345     return CreateAdd(StartValue, Offset);
3346   }
3347   case InductionDescriptor::IK_PtrInduction: {
3348     assert(isa<SCEVConstant>(Step) &&
3349            "Expected constant step for pointer induction");
3350     return B.CreateGEP(
3351         ID.getElementType(), StartValue,
3352         CreateMul(Index,
3353                   Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3354                                     GetInsertPoint())));
3355   }
3356   case InductionDescriptor::IK_FpInduction: {
3357     assert(!isa<VectorType>(Index->getType()) &&
3358            "Vector indices not supported for FP inductions yet");
3359     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3360     auto InductionBinOp = ID.getInductionBinOp();
3361     assert(InductionBinOp &&
3362            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3363             InductionBinOp->getOpcode() == Instruction::FSub) &&
3364            "Original bin op should be defined for FP induction");
3365 
3366     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3367     Value *MulExp = B.CreateFMul(StepValue, Index);
3368     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3369                          "induction");
3370   }
3371   case InductionDescriptor::IK_NoInduction:
3372     return nullptr;
3373   }
3374   llvm_unreachable("invalid enum");
3375 }
3376 
/// Split the original preheader to create the CFG skeleton around the (still
/// empty) vector loop: vector preheader -> vector body, with a middle block
/// and a scalar preheader carved out for the remainder loop. Registers the
/// new loop with LoopInfo and keeps the DominatorTree up to date.
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  // Split twice: first carve the middle block off the preheader, then carve
  // the scalar preheader off the middle block.
  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator.  Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditonal
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block.  completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
    BranchInst::Create(LoopScalarPreHeader) :
    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                       Builder.getTrue());
  // Reuse the scalar latch's debug location so debugging doesn't step oddly.
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock to update LoopInfo since
  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks  and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}
3437 
/// Create the "bc.resume.val" PHIs in the scalar preheader so the scalar
/// remainder loop resumes each induction variable either from its original
/// start value (when arriving via a bypass edge) or from the value it reaches
/// at the end of the vector loop. \p AdditionalBypass optionally provides a
/// third predecessor block plus the trip count to compute its end value from
/// (used by epilogue vectorization).
void InnerLoopVectorizer::createInductionResumeValues(
    Loop *L, Value *VectorTripCount,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(VectorTripCount && L && "Expected valid arguments");
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the  backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // We know what the end value is: the primary induction counts from
      // zero with unit step, so it ends at the vector trip count.
      EndValue = VectorTripCount;
    } else {
      // For all other inductions, materialize the end value in the loop
      // preheader as start + step * vector-trip-count.
      IRBuilder<> B(L->getLoopPreheader()->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
      EndValue =
          emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
      EndValue->setName("ind.end");

      // Compute the end value for the additional bypass (if applicable).
      if (AdditionalBypass.first) {
        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
        CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
                                         StepType, true);
        CRD =
            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
        EndValueFromAdditionalBypass =
            emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
        EndValueFromAdditionalBypass->setName("ind.end");
      }
    }
    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);

    if (AdditionalBypass.first)
      BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
                                            EndValueFromAdditionalBypass);

    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}
3512 
/// Finish the loop skeleton: install the middle-block iteration check (when
/// one is needed), position the IR builder at the start of the vector body,
/// and transfer loop metadata (follow-up hints, or the original hints plus
/// an "already vectorized" marker) onto the new loop. Returns the vector
/// loop preheader.
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
                                                      MDNode *OrigLoopID) {
  assert(L && "Expected valid loop.");

  // The trip counts should be cached by now.
  Value *Count = getOrCreateTripCount(L);
  Value *VectorTripCount = getOrCreateVectorTripCount(L);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.  Three cases:
  // 1) If we require a scalar epilogue, there is no conditional branch as
  //    we unconditionally branch to the scalar preheader.  Do nothing.
  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
  //    Thus if tail is to be folded, we know we don't need to run the
  //    remainder and we can use the previous value for the condition (true).
  // 3) Otherwise, construct a runtime check.
  if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
    Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                                        Count, VectorTripCount, "cmp.n",
                                        LoopMiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
    cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
  }

  // Get ready to start creating new instructions into the vectorized body.
  assert(LoopVectorPreHeader == L->getLoopPreheader() &&
         "Inconsistent vector loop preheader");
  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());

  // If follow-up metadata was specified on the original loop, attach it to
  // the new vector loop and return early.
  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    L->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    L->setLoopID(LID);

  LoopVectorizeHints Hints(L, true, *ORE);
  Hints.setAlreadyVectorized();

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  LI->verify(*DT);
#endif

  return LoopVectorPreHeader;
}
3575 
/// Top-level driver that builds the full skeleton for the vectorized loop:
/// the block structure, the runtime guards (minimum iteration count, SCEV
/// assumptions, memory overlap checks), the primary induction variable, and
/// the resume values feeding the scalar remainder loop.
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  \   -[ ]   <--- middle-block.
   \/   |
   /\   v
   | ->[ ]     <--- new preheader.
   |    |
 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
    \   |
     \  v
      >[ ]     <-- exit block(s).
   ...
   */

  // Get the metadata of the original loop before it gets modified.
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  // Workaround!  Compute the trip count of the original loop and cache it
  // before we start modifying the CFG.  This code has a systemic problem
  // wherein it tries to run analysis over partially constructed IR; this is
  // wrong, and not simply for SCEV.  The trip count of the original loop
  // simply happens to be prone to hitting this in practice.  In theory, we
  // can hit the same issue for any SCEV, or ValueTracking query done during
  // mutation.  See PR49900.
  getOrCreateTripCount(OrigLoop);

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  Loop *Lp = createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
  Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}
3669 
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
//
// \p CountRoundDown is the vector trip count, \p EndValue the IV value after
// the vector loop, and \p MiddleBlock the block whose incoming values on the
// exit PHIs are being filled in.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value need to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      // Cast CRD-1 to the step's type: SIToFP for FP inductions,
      // sext/trunc otherwise.
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape =
          emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}
3740 
3741 namespace {
3742 
3743 struct CSEDenseMapInfo {
3744   static bool canHandle(const Instruction *I) {
3745     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3746            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3747   }
3748 
3749   static inline Instruction *getEmptyKey() {
3750     return DenseMapInfo<Instruction *>::getEmptyKey();
3751   }
3752 
3753   static inline Instruction *getTombstoneKey() {
3754     return DenseMapInfo<Instruction *>::getTombstoneKey();
3755   }
3756 
3757   static unsigned getHashValue(const Instruction *I) {
3758     assert(canHandle(I) && "Unknown instruction!");
3759     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3760                                                            I->value_op_end()));
3761   }
3762 
3763   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3764     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3765         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3766       return LHS == RHS;
3767     return LHS->isIdenticalTo(RHS);
3768   }
3769 };
3770 
3771 } // end anonymous namespace
3772 
3773 ///Perform cse of induction variable instructions.
3774 static void cse(BasicBlock *BB) {
3775   // Perform simple cse.
3776   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3777   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3778     if (!CSEDenseMapInfo::canHandle(&In))
3779       continue;
3780 
3781     // Check if we can replace this instruction with any of the
3782     // visited instructions.
3783     if (Instruction *V = CSEMap.lookup(&In)) {
3784       In.replaceAllUsesWith(V);
3785       In.eraseFromParent();
3786       continue;
3787     }
3788 
3789     CSEMap[&In] = &In;
3790   }
3791 }
3792 
3793 InstructionCost
3794 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3795                                               bool &NeedToScalarize) const {
3796   Function *F = CI->getCalledFunction();
3797   Type *ScalarRetTy = CI->getType();
3798   SmallVector<Type *, 4> Tys, ScalarTys;
3799   for (auto &ArgOp : CI->args())
3800     ScalarTys.push_back(ArgOp->getType());
3801 
3802   // Estimate cost of scalarized vector call. The source operands are assumed
3803   // to be vectors, so we need to extract individual elements from there,
3804   // execute VF scalar calls, and then gather the result into the vector return
3805   // value.
3806   InstructionCost ScalarCallCost =
3807       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3808   if (VF.isScalar())
3809     return ScalarCallCost;
3810 
3811   // Compute corresponding vector type for return value and arguments.
3812   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3813   for (Type *ScalarTy : ScalarTys)
3814     Tys.push_back(ToVectorTy(ScalarTy, VF));
3815 
3816   // Compute costs of unpacking argument values for the scalar calls and
3817   // packing the return values to a vector.
3818   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3819 
3820   InstructionCost Cost =
3821       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3822 
3823   // If we can't emit a vector call for this function, then the currently found
3824   // cost is the cost we need to return.
3825   NeedToScalarize = true;
3826   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3827   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3828 
3829   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3830     return Cost;
3831 
3832   // If the corresponding vector cost is cheaper, return its cost.
3833   InstructionCost VectorCallCost =
3834       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3835   if (VectorCallCost < Cost) {
3836     NeedToScalarize = false;
3837     Cost = VectorCallCost;
3838   }
3839   return Cost;
3840 }
3841 
3842 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3843   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3844     return Elt;
3845   return VectorType::get(Elt, VF);
3846 }
3847 
3848 InstructionCost
3849 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3850                                                    ElementCount VF) const {
3851   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3852   assert(ID && "Expected intrinsic call!");
3853   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3854   FastMathFlags FMF;
3855   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3856     FMF = FPMO->getFastMathFlags();
3857 
3858   SmallVector<const Value *> Arguments(CI->args());
3859   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3860   SmallVector<Type *> ParamTys;
3861   std::transform(FTy->param_begin(), FTy->param_end(),
3862                  std::back_inserter(ParamTys),
3863                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3864 
3865   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3866                                     dyn_cast<IntrinsicInst>(CI));
3867   return TTI.getIntrinsicInstrCost(CostAttrs,
3868                                    TargetTransformInfo::TCK_RecipThroughput);
3869 }
3870 
3871 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3872   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3873   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3874   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3875 }
3876 
3877 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3878   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3879   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3880   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3881 }
3882 
/// Shrink the instructions recorded in the cost model's minimal-bitwidths
/// map: truncate their operands, re-create the operation in the narrower
/// type, and zero-extend the result back to the original type. InstCombine
/// later removes the redundant ext/trunc pairs this introduces.
void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    // FIXME: Should not rely on getVPValue at this point.
    VPValue *Def = State.Plan->getVPValue(KV.first, true);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      auto *TruncatedTy = VectorType::get(
          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      // Narrow a value to TruncatedTy, peeling off a matching zext when one
      // is already present instead of stacking casts.
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // Shuffle operands may have differing element counts; narrow each to
        // its own truncated vector type before re-creating the shuffle.
        auto Elements0 =
            cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 =
            cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements =
            cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements =
            cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      State.reset(Def, Res, Part);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    // FIXME: Should not rely on getVPValue at this point.
    VPValue *Def = State.Plan->getVPValue(KV.first, true);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      // A dead zext was only needed by an instruction we just shrank; drop
      // it and expose its narrow operand as the value for this part.
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        State.reset(Def, NewI, Part);
      }
    }
  }
}
4015 
/// Final fix-up pass run after the vector body has been emitted: narrows
/// values to minimal bitwidths, completes cross-iteration PHIs, patches
/// external (LCSSA) users, sinks predicated scalars, and updates profile
/// metadata for the vector and remainder loops.
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths(State);

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs(State);
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs(State);

  // Forget the original basic block: SCEV info computed for the scalar loop
  // is invalidated by the transformation.
  PSE.getSE()->forgetLoop(OrigLoop);

  // If we inserted an edge from the middle block to the unique exit block,
  // update uses outside the loop (phis) to account for the newly inserted
  // edge.
  if (!Cost->requiresScalarEpilogue(VF)) {
    // Fix-up external users of the induction variables.
    for (auto &Entry : Legal->getInductionVars())
      fixupIVUsers(Entry.first, Entry.second,
                   getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                   IVEndValues[Entry.first], LoopMiddleBlock);

    fixLCSSAPHIs(State);
  }

  // Sink scalarized operands of predicated instructions into the blocks
  // created for them, so they only execute when the predicate holds.
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // represented by LoopScalarBody becomes remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many iterations
  // of the loop are handled in one vector iteration, so instead assume a pessimistic
  // vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}
4074 
4075 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4076   // In order to support recurrences we need to be able to vectorize Phi nodes.
4077   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4078   // stage #2: We now need to fix the recurrences by adding incoming edges to
4079   // the currently empty PHI nodes. At this point every instruction in the
4080   // original loop is widened to a vector form so we can use them to construct
4081   // the incoming edges.
4082   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4083   for (VPRecipeBase &R : Header->phis()) {
4084     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4085       fixReduction(ReductionPhi, State);
4086     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4087       fixFirstOrderRecurrence(FOR, State);
4088   }
4089 }
4090 
void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
                                                  VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // vector phi v1 for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  // 'Incoming' is the widened backedge value of the last unrolled part.
  VPValue *PreviousDef = PhiR->getBackedgeValue();
  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;
  auto *IdxTy = Builder.getInt32Ty();
  if (VF.isVector()) {
    // Compute the index of the last lane at runtime (VF may be scalable)
    // and extract it.
    auto *One = ConstantInt::get(IdxTy, 1);
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
                                                    "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector()) {
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Idx, "vector.recur.extract.for.phi");
  } else if (UF > 1)
    // When loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
    // of `Incoming`. This is analogous to the vectorized case above: extracting
    // the second last element when VF > 1.
    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  // The scalar preheader may be reached either from the middle block (use the
  // extracted value) or from a bypass block (use the original start value).
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis need to be updated.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}
4201 
void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
                                       VPTransformState &State) {
  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(OrigPhi) &&
         "Unable to find the reduction variable");
  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  setDebugLocFromInst(ReductionStartValue);

  VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();

  // Wrap flags are in general invalid after vectorization, clear them.
  clearReductionWrapFlags(RdxDesc, State);

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(LoopExitInst);

  Type *PhiTy = OrigPhi->getType();
  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former. For an inloop reduction the reduction will already
  // be predicated, and does not need to be handled here.
  if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      Value *Sel = nullptr;
      // Find the single select user of the exit value; its other users must
      // be the reduction phi(s).
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

      // If the target can create a predicated operator for the reduction at no
      // extra cost in the loop (for example a predicated vadd), it can be
      // cheaper for the select to remain in the loop than be sunk out of it,
      // and so use the select value for the phi instead of the old
      // LoopExitValue.
      if (PreferPredicatedReductionSelect ||
          TTI->preferPredicatedReductionSelect(
              RdxDesc.getOpcode(), PhiTy,
              TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi =
            cast<PHINode>(State.get(PhiR, Part));
        VecRdxPhi->setIncomingValueForBlock(
            LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
      }
    }
  }

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
    assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = State.get(LoopExitInstDef, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      // Redirect all users (except the trunc we just made) to the extended
      // value, so the narrow computation is only visible via trunc/ext pairs.
      for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
        if (U != Trunc) {
          U->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      State.reset(LoopExitInstDef, RdxParts[Part], Part);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
  unsigned Op = RecurrenceDescriptor::getOpcode(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(LoopMiddleBlock->getTerminator());
  if (PhiR->isOrdered())
    // Ordered reductions are computed in-loop; the last part already holds
    // the full result.
    ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
  else {
    // Floating-point operations should have some FMF to enable the reduction.
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
    for (unsigned Part = 1; Part < UF; ++Part) {
      Value *RdxPart = State.get(LoopExitInstDef, Part);
      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
        ReducedPartRdx = Builder.CreateBinOp(
            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
      } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
        ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
                                           ReducedPartRdx, RdxPart);
      else
        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
    }
  }

  // Create the reduction after the loop. Note that inloop reductions create the
  // target reduction in the loop using a Reduction recipe.
  if (VF.isVector() && !PhiR->isInLoop()) {
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (PhiTy != RdxDesc.getRecurrenceType())
      ReducedPartRdx = RdxDesc.isSigned()
                           ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                           : Builder.CreateZExt(ReducedPartRdx, PhiTy);
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.

  // We know that the loop is in LCSSA form. We need to update the PHI nodes
  // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
        LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
4366 
4367 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4368                                                   VPTransformState &State) {
4369   RecurKind RK = RdxDesc.getRecurrenceKind();
4370   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4371     return;
4372 
4373   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4374   assert(LoopExitInstr && "null loop exit instruction");
4375   SmallVector<Instruction *, 8> Worklist;
4376   SmallPtrSet<Instruction *, 8> Visited;
4377   Worklist.push_back(LoopExitInstr);
4378   Visited.insert(LoopExitInstr);
4379 
4380   while (!Worklist.empty()) {
4381     Instruction *Cur = Worklist.pop_back_val();
4382     if (isa<OverflowingBinaryOperator>(Cur))
4383       for (unsigned Part = 0; Part < UF; ++Part) {
4384         // FIXME: Should not rely on getVPValue at this point.
4385         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4386         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4387       }
4388 
4389     for (User *U : Cur->users()) {
4390       Instruction *UI = cast<Instruction>(U);
4391       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4392           Visited.insert(UI).second)
4393         Worklist.push_back(UI);
4394     }
4395   }
4396 }
4397 
4398 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4399   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4400     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4401       // Some phis were already hand updated by the reduction and recurrence
4402       // code above, leave them alone.
4403       continue;
4404 
4405     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4406     // Non-instruction incoming values will have only one value.
4407 
4408     VPLane Lane = VPLane::getFirstLane();
4409     if (isa<Instruction>(IncomingValue) &&
4410         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4411                                            VF))
4412       Lane = VPLane::getLastLaneForVF(VF);
4413 
4414     // Can be a loop invariant incoming value or the last scalar value to be
4415     // extracted from the vectorized loop.
4416     // FIXME: Should not rely on getVPValue at this point.
4417     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4418     Value *lastIncomingValue =
4419         OrigLoop->isLoopInvariant(IncomingValue)
4420             ? IncomingValue
4421             : State.get(State.Plan->getVPValue(IncomingValue, true),
4422                         VPIteration(UF - 1, Lane));
4423     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4424   }
4425 }
4426 
/// Iteratively sink the scalarized operands of the predicated instruction
/// \p PredInst into the predicated block created for it, so they only execute
/// when the predicate is true. Runs to a fixpoint: sinking one instruction
/// may enable sinking its operands in a later pass.
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // or may have side effects.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}
4500 
4501 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4502   for (PHINode *OrigPhi : OrigPHIsToFix) {
4503     VPWidenPHIRecipe *VPPhi =
4504         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4505     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4506     // Make sure the builder has a valid insert point.
4507     Builder.SetInsertPoint(NewPhi);
4508     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4509       VPValue *Inc = VPPhi->getIncomingValue(i);
4510       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4511       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4512     }
4513   }
4514 }
4515 
/// Returns true if the reduction described by \p RdxDesc must be generated
/// in order. The decision is owned entirely by the cost model; this is a
/// thin forwarding wrapper.
bool InnerLoopVectorizer::useOrderedReductions(
    const RecurrenceDescriptor &RdxDesc) {
  return Cost->useOrderedReductions(RdxDesc);
}
4520 
/// Widen a PHI node. Handles two cases: (1) in the VPlan-native path, any
/// non-induction PHI is widened to an empty vector phi whose operands are
/// filled in later by fixNonInductionPHIs(); (2) in the inner-loop path,
/// only pointer inductions reach here (int/fp inductions and reductions are
/// handled elsewhere) and are expanded to scalar GEPs or a pointer phi plus
/// vector GEPs.
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                              VPWidenPHIRecipe *PhiR,
                                              VPTransformState &State) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy = (State.VF.isScalar())
                      ? PN->getType()
                      : VectorType::get(PN->getType(), State.VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    State.set(PhiR, VecPhi, 0);
    // Remember the original phi so fixNonInductionPHIs() can add the
    // incoming edges once all blocks have been generated.
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.

  assert(!Legal->isReductionVariable(P) &&
         "reductions should be handled elsewhere");

  setDebugLocFromInst(P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars().count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars().lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");

    if (Cost->isScalarAfterVectorization(P, State.VF)) {
      // The induction stays scalar: emit one transformed-index GEP per
      // (part, lane) that needs a value.
      // This is the normalized GEP that starts counting at zero.
      Value *PtrInd =
          Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
      // Determine the number of scalars we need to generate for each unroll
      // iteration. If the instruction is uniform, we only need to generate the
      // first lane. Otherwise, we generate all VF values.
      bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
      assert((IsUniform || !State.VF.isScalable()) &&
             "Cannot scalarize a scalable VF");
      unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();

      for (unsigned Part = 0; Part < UF; ++Part) {
        // Index of lane 0 of this unrolled part, relative to PtrInd.
        Value *PartStart =
            createStepForVF(Builder, PtrInd->getType(), VF, Part);

        for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
          Value *Idx = Builder.CreateAdd(
              PartStart, ConstantInt::get(PtrInd->getType(), Lane));
          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
          Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
                                                DL, II, State.CFG.PrevBB);
          SclrGep->setName("next.gep");
          State.set(PhiR, SclrGep, VPIteration(Part, Lane));
        }
      }
      return;
    }
    assert(isa<SCEVConstant>(II.getStep()) &&
           "Induction step not a SCEV constant!");
    Type *PhiType = II.getStep()->getType();

    // Build a pointer phi
    Value *ScalarStartValue = II.getStartValue();
    Type *ScStValueType = ScalarStartValue->getType();
    PHINode *NewPointerPhi =
        PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
    NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);

    // A pointer induction, performed by using a gep
    BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
    Instruction *InductionLoc = LoopLatch->getTerminator();
    const SCEV *ScalarStep = II.getStep();
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Value *ScalarStepValue =
        Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
    // The phi advances by step * VF * UF elements each vector iteration.
    Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
    Value *NumUnrolledElems =
        Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
    Value *InductionGEP = GetElementPtrInst::Create(
        II.getElementType(), NewPointerPhi,
        Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
        InductionLoc);
    NewPointerPhi->addIncoming(InductionGEP, LoopLatch);

    // Create UF many actual address geps that use the pointer
    // phi as base and a vectorized version of the step value
    // (<step*0, ..., step*N>) as offset.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Type *VecPhiType = VectorType::get(PhiType, State.VF);
      Value *StartOffsetScalar =
          Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
      Value *StartOffset =
          Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
      // Create a vector of consecutive numbers from zero to VF.
      StartOffset =
          Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));

      Value *GEP = Builder.CreateGEP(
          II.getElementType(), NewPointerPhi,
          Builder.CreateMul(
              StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
              "vector.gep"));
      State.set(PhiR, GEP, Part);
    }
  }
  }
}
4650 
4651 /// A helper function for checking whether an integer division-related
4652 /// instruction may divide by zero (in which case it must be predicated if
4653 /// executed conditionally in the scalar code).
4654 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4655 /// Non-zero divisors that are non compile-time constants will not be
4656 /// converted into multiplication, so we will still end up scalarizing
4657 /// the division, but can do so w/o predication.
4658 static bool mayDivideByZero(Instruction &I) {
4659   assert((I.getOpcode() == Instruction::UDiv ||
4660           I.getOpcode() == Instruction::SDiv ||
4661           I.getOpcode() == Instruction::URem ||
4662           I.getOpcode() == Instruction::SRem) &&
4663          "Unexpected instruction");
4664   Value *Divisor = I.getOperand(1);
4665   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4666   return !CInt || CInt->isZero();
4667 }
4668 
4669 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4670                                                VPUser &ArgOperands,
4671                                                VPTransformState &State) {
4672   assert(!isa<DbgInfoIntrinsic>(I) &&
4673          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4674   setDebugLocFromInst(&I);
4675 
4676   Module *M = I.getParent()->getParent()->getParent();
4677   auto *CI = cast<CallInst>(&I);
4678 
4679   SmallVector<Type *, 4> Tys;
4680   for (Value *ArgOperand : CI->args())
4681     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4682 
4683   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4684 
4685   // The flag shows whether we use Intrinsic or a usual Call for vectorized
4686   // version of the instruction.
4687   // Is it beneficial to perform intrinsic call compared to lib call?
4688   bool NeedToScalarize = false;
4689   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4690   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4691   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4692   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4693          "Instruction should be scalarized elsewhere.");
4694   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4695          "Either the intrinsic cost or vector call cost must be valid");
4696 
4697   for (unsigned Part = 0; Part < UF; ++Part) {
4698     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4699     SmallVector<Value *, 4> Args;
4700     for (auto &I : enumerate(ArgOperands.operands())) {
4701       // Some intrinsics have a scalar argument - don't replace it with a
4702       // vector.
4703       Value *Arg;
4704       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4705         Arg = State.get(I.value(), Part);
4706       else {
4707         Arg = State.get(I.value(), VPIteration(0, 0));
4708         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4709           TysForDecl.push_back(Arg->getType());
4710       }
4711       Args.push_back(Arg);
4712     }
4713 
4714     Function *VectorF;
4715     if (UseVectorIntrinsic) {
4716       // Use vector version of the intrinsic.
4717       if (VF.isVector())
4718         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4719       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4720       assert(VectorF && "Can't retrieve vector intrinsic.");
4721     } else {
4722       // Use vector version of the function call.
4723       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4724 #ifndef NDEBUG
4725       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4726              "Can't create vector function.");
4727 #endif
4728         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4729     }
4730       SmallVector<OperandBundleDef, 1> OpBundles;
4731       CI->getOperandBundlesAsDefs(OpBundles);
4732       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4733 
4734       if (isa<FPMathOperator>(V))
4735         V->copyFastMathFlags(CI);
4736 
4737       State.set(Def, V, Part);
4738       addMetadata(V, &I);
4739   }
4740 }
4741 
// Computes the set of instructions that will remain scalar (i.e. will not be
// widened) when vectorizing with factor \p VF, and records it in Scalars[VF].
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // Keep only pointers that were never seen in a possibly-non-scalar context.
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    // Src is scalar if every in-loop user is already scalar or is a
    // load/store using Src as a scalar pointer.
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
4918 
4919 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
4920   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4921     return false;
4922   switch(I->getOpcode()) {
4923   default:
4924     break;
4925   case Instruction::Load:
4926   case Instruction::Store: {
4927     if (!Legal->isMaskRequired(I))
4928       return false;
4929     auto *Ptr = getLoadStorePointerOperand(I);
4930     auto *Ty = getLoadStoreType(I);
4931     const Align Alignment = getLoadStoreAlignment(I);
4932     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4933                                 TTI.isLegalMaskedGather(Ty, Alignment))
4934                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4935                                 TTI.isLegalMaskedScatter(Ty, Alignment));
4936   }
4937   case Instruction::UDiv:
4938   case Instruction::SDiv:
4939   case Instruction::SRem:
4940   case Instruction::URem:
4941     return mayDivideByZero(*I);
4942   }
4943   return false;
4944 }
4945 
// Returns true if the interleave group of \p I can be vectorized with wide
// (possibly masked) loads/stores rather than being scalarized.
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getLoadStoreType(I);
  if (hasIrregularType(ScalarTy, DL))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps
  // (either a gap at the end of a load-access that may result in a speculative
  // load, or any gaps in a store-access).
  bool PredicatedAccessRequiresMasking =
      blockNeedsPredicationForAnyReason(I->getParent()) &&
      Legal->isMaskRequired(I);
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
      !isScalarEpilogueAllowed();
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
  // With no masking needed, the group can always be widened.
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  // Masking of reversed interleave groups is not supported.
  if (Group->isReverse())
    return false;

  // Finally, masked widening requires target support for the masked op.
  auto *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}
4993 
4994 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4995     Instruction *I, ElementCount VF) {
4996   // Get and ensure we have a valid memory instruction.
4997   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4998 
4999   auto *Ptr = getLoadStorePointerOperand(I);
5000   auto *ScalarTy = getLoadStoreType(I);
5001 
5002   // In order to be widened, the pointer should be consecutive, first of all.
5003   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
5004     return false;
5005 
5006   // If the instruction is a store located in a predicated block, it will be
5007   // scalarized.
5008   if (isScalarWithPredication(I))
5009     return false;
5010 
5011   // If the instruction's allocated size doesn't equal it's type size, it
5012   // requires padding and will be scalarized.
5013   auto &DL = I->getModule()->getDataLayout();
5014   if (hasIrregularType(ScalarTy, DL))
5015     return false;
5016 
5017   return true;
5018 }
5019 
// Computes the set of instructions that will remain uniform (only lane 0
// demanded) when vectorizing with factor \p VF, recording it in Uniforms[VF].
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Create an (initially empty) entry so that even if no uniform value is
  // found, Uniforms.count(VF) returns 1 and we do not analyze this VF again.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that are scalar with predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isScalarWithPredication(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  // Returns true if the widening decision for \p I means its address
  // computation only demands lane 0 (wide consecutive or interleaved access,
  // or a uniform scalarized load).
  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    // A uniform memory op is itself uniform.  We exclude uniform stores
    // here as they demand the last lane, not the first one.
    if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
      assert(WideningDecision == CM_Scalarize);
      return true;
    }

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };


  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform.  A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(&I))
            addToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      // ExtractValue instructions must be uniform, because the operands are
      // known to be loop-invariant.
      if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
        assert(isOutOfScope(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be loop invariant");
        addToWorklistIfAllowed(EVI);
        continue;
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // A uniform memory op is itself uniform.  We exclude uniform stores
      // here as they demand the last lane, not the first one.
      if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
        addToWorklistIfAllowed(&I);

      if (isUniformDecision(&I, VF)) {
        assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
        HasUniformUse.insert(Ptr);
      }
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users.  Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (isOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(V);
    auto UsersAreMemAccesses =
      llvm::all_of(I->users(), [&](User *U) -> bool {
        return isVectorizedMemAccessUse(cast<Instruction>(U), V);
      });
    if (UsersAreMemAccesses)
      addToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist.  It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFirstOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
          }))
        addToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
5230 
5231 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5232   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5233 
5234   if (Legal->getRuntimePointerChecking()->Need) {
5235     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5236         "runtime pointer checks needed. Enable vectorization of this "
5237         "loop with '#pragma clang loop vectorize(enable)' when "
5238         "compiling with -Os/-Oz",
5239         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5240     return true;
5241   }
5242 
5243   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5244     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5245         "runtime SCEV checks needed. Enable vectorization of this "
5246         "loop with '#pragma clang loop vectorize(enable)' when "
5247         "compiling with -Os/-Oz",
5248         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5249     return true;
5250   }
5251 
5252   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5253   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5254     reportVectorizationFailure("Runtime stride check for small trip count",
5255         "runtime stride == 1 checks needed. Enable vectorization of "
5256         "this loop without such check by compiling with -Os/-Oz",
5257         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5258     return true;
5259   }
5260 
5261   return false;
5262 }
5263 
// Returns the maximum legal scalable vectorization factor for this loop, or
// a zero scalable VF when scalable vectorization is unsupported, disabled,
// or infeasible; \p MaxSafeElements bounds the VF by dependence distance.
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  // Bail out early if neither the target nor the force flag supports
  // scalable vectors.
  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
    return ElementCount::getScalable(0);

  if (Hints->isScalableVectorizationDisabled()) {
    reportVectorizationInfo("Scalable vectorization is explicitly disabled",
                            "ScalableVectorizationDisabled", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");

  // Start from an unbounded scalable VF and clamp it below.
  auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());

  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.

  // Disable scalable vectorization if the loop contains unsupported reductions.
  if (!canVectorizeReductions(MaxScalableVF)) {
    reportVectorizationInfo(
        "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // Disable scalable vectorization if the loop contains any instructions
  // with element types not supported for scalable vectors.
  if (any_of(ElementTypesInLoop, [&](Type *Ty) {
        return !Ty->isVoidTy() &&
               !this->TTI.isElementTypeLegalForScalableVector(Ty);
      })) {
    reportVectorizationInfo("Scalable vectorization is not supported "
                            "for all element types found in this loop.",
                            "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // With no dependence-distance limit, any scalable width is safe.
  if (Legal->isSafeForAnyVectorWidth())
    return MaxScalableVF;

  // Limit MaxScalableVF by the maximum safe dependence distance. If the
  // target reports no max vscale, fall back to the function's vscale_range
  // attribute; with neither available the scalable VF becomes 0 (infeasible).
  Optional<unsigned> MaxVScale = TTI.getMaxVScale();
  if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
    MaxVScale =
        TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
  MaxScalableVF = ElementCount::getScalable(
      MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
  if (!MaxScalableVF)
    reportVectorizationInfo(
        "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        "ScalableVFUnfeasible", ORE, TheLoop);

  return MaxScalableVF;
}
5325 
// Compute the maximum feasible fixed and scalable VFs, constrained by the
// dependence-safe maximum width (from LAA), the target's register widths,
// and any user-specified VF hint. ConstTripCount is the constant trip count
// if known (0 otherwise); FoldTailByMasking indicates whether the loop tail
// will be folded into the vector body via masking.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);

  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
      else
        return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // A scalable UserVF exceeding the safe maximum is dropped entirely; emit
    // a remark explaining why so the user knows their hint was not honoured.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // No (valid) user hint: maximize both the fixed and the scalable VF for the
  // target independently; the caller chooses between the two candidates.
  FixedScalableVFPair Result(ElementCount::getFixed(1),
                             ElementCount::getScalable(0));
  if (auto MaxVF =
          getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
                                  MaxSafeFixedVF, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
                                  MaxSafeScalableVF, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
5430 
// Top-level driver for choosing the maximum VF candidates. Handles early
// bail-outs (divergent targets needing runtime checks, trip count of one),
// the scalar-epilogue policy, and the decision whether to fold the tail by
// masking or fall back to a scalar epilogue.
FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to vectorize anyway, since the checks are still
    // likely to be dynamically uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF, false);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue, are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration.  This will
  // require a lane mask which varies through the vector loop body.  (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
    ElementCount MaxFixedVF = MaxFactors.FixedVF;
    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
                                   : MaxFixedVF.getFixedValue();
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    // The exit count is BackedgeTakenCount + 1; check divisibility of the
    // exit count (tightened via loop guards) by MaxVF * IC symbolically.
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // For scalable vectors, don't use tail folding as this is currently not yet
  // supported. The code is likely to have ended up here if the tripcount is
  // low, in which case it makes sense not to use scalable vectors.
  if (MaxFactors.ScalableVF.isVector())
    MaxFactors.ScalableVF = ElementCount::getScalable(0);

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxFactors;
  }

  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
    return FixedScalableVFPair::getNone();
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return FixedScalableVFPair::getNone();
}
5579 
// Compute the largest VF the target can handle for the register kind implied
// by MaxSafeVF (fixed or scalable), clamped by MaxSafeVF, the constant trip
// count (if any), and — when maximizing bandwidth — register pressure.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
    const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be a powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If loop trip count (TC) is known at compile time there is no point in
    // choosing VF greater than TC (as done in the loop below). Select maximum
    // power of two which doesn't exceed TC.
    // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
    // when the TC is less than or equal to the known number of lanes.
    auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                         "exceeding the constant trip count: "
                      << ClampedConstTripCount << "\n");
    return ElementCount::getFixed(ClampedConstTripCount);
  }

  ElementCount MaxVF = MaxVectorElementCount;
  if (TTI.shouldMaximizeVectorBandwidth() ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // When maximizing bandwidth, size the vectors by the smallest element
    // type so that narrow types fill whole registers, then discard any VF
    // that would exceed the available registers.
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    // Respect the target's minimum VF for the smallest element type, even if
    // register-usage analysis picked something smaller.
    if (ElementCount MinVF =
            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(MaxVF, MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}
5670 
// Returns true if vectorization factor A is more profitable than B. Compares
// total rounded-up costs when tail-folding with a known max trip count, and
// estimated per-lane costs otherwise (scaling scalable widths by the tuning
// vscale when the target provides one).
bool LoopVectorizationCostModel::isMoreProfitable(
    const VectorizationFactor &A, const VectorizationFactor &B) const {
  InstructionCost CostA = A.Cost;
  InstructionCost CostB = B.Cost;

  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);

  if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
      MaxTripCount) {
    // If we are folding the tail and the trip count is a known (possibly small)
    // constant, the trip count will be rounded up to an integer number of
    // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
    // which we compare directly. When not folding the tail, the total cost will
    // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the tripcount
    // as here.
    auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
    auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
    return RTCostA < RTCostB;
  }

  // Improve estimate for the vector width if it is scalable.
  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
  if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
    if (A.Width.isScalable())
      EstimatedWidthA *= VScale.getValue();
    if (B.Width.isScalable())
      EstimatedWidthB *= VScale.getValue();
  }

  // When set to preferred, for now assume vscale may be larger than 1 (or the
  // one being tuned for), so that scalable vectorization is slightly favorable
  // over fixed-width vectorization.
  // Note the <= here: ties go to the scalable candidate.
  if (Hints->isScalableVectorizationPreferred())
    if (A.Width.isScalable() && !B.Width.isScalable())
      return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);

  // To avoid the need for FP division:
  //      (CostA / A.Width) < (CostB / B.Width)
  // <=>  (CostA * B.Width) < (CostB * A.Width)
  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}
5714 
// Pick the most profitable VF from the candidate set by comparing expected
// per-VF costs against the scalar-loop cost. Also records every candidate
// that beats the scalar cost into ProfitableVFs (used later for epilogue
// vectorization) and emits remarks for VFs with invalid instruction costs.
VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
    const ElementCountSet &VFCandidates) {
  InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
         "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && VFCandidates.size() > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  SmallVector<InstructionVFPair> InvalidCosts;
  for (const auto &i : VFCandidates) {
    // The cost for scalar VF=1 is already calculated, so ignore it.
    if (i.isScalar())
      continue;

    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
    VectorizationFactor Candidate(i, C.first);

#ifndef NDEBUG
    unsigned AssumedMinimumVscale = 1;
    if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
      AssumedMinimumVscale = VScale.getValue();
    unsigned Width =
        Candidate.Width.isScalable()
            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
            : Candidate.Width.getFixedValue();
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (Candidate.Cost / Width));
    if (i.isScalable())
      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                        << AssumedMinimumVscale << ")");
    LLVM_DEBUG(dbgs() << ".\n");
#endif

    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable add it to ProfitableVF list.
    if (isMoreProfitable(Candidate, ScalarCost))
      ProfitableVFs.push_back(Candidate);

    if (isMoreProfitable(Candidate, ChosenFactor))
      ChosenFactor = Candidate;
  }

  // Emit a report of VFs with invalid costs in the loop.
  if (!InvalidCosts.empty()) {
    // Group the remarks per instruction, keeping the instruction order from
    // InvalidCosts.
    std::map<Instruction *, unsigned> Numbering;
    unsigned I = 0;
    for (auto &Pair : InvalidCosts)
      if (!Numbering.count(Pair.first))
        Numbering[Pair.first] = I++;

    // Sort the list, first on instruction(number) then on VF.
    llvm::sort(InvalidCosts,
               [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
                 if (Numbering[A.first] != Numbering[B.first])
                   return Numbering[A.first] < Numbering[B.first];
                 ElementCountComparator ECC;
                 return ECC(A.second, B.second);
               });

    // For a list of ordered instruction-vf pairs:
    //   [(load, vf1), (load, vf2), (store, vf1)]
    // Group the instructions together to emit separate remarks for:
    //   load  (vf1, vf2)
    //   store (vf1)
    auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
    auto Subset = ArrayRef<InstructionVFPair>();
    do {
      if (Subset.empty())
        Subset = Tail.take_front(1);

      Instruction *I = Subset.front().first;

      // If the next instruction is different, or if there are no other pairs,
      // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2))]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
      if (Subset == Tail || Tail[Subset.size()].first != I) {
        std::string OutString;
        raw_string_ostream OS(OutString);
        assert(!Subset.empty() && "Unexpected empty range");
        OS << "Instruction with invalid costs prevented vectorization at VF=(";
        for (auto &Pair : Subset)
          OS << (Pair.second == Subset.front().second ? "" : ", ")
             << Pair.second;
        OS << "):";
        if (auto *CI = dyn_cast<CallInst>(I))
          OS << " call to " << CI->getCalledFunction()->getName();
        else
          OS << " " << I->getOpcodeName();
        OS.flush();
        reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
        Tail = Tail.drop_front(Subset.size());
        Subset = {};
      } else
        // Grow the subset by one element
        Subset = Tail.take_front(Subset.size() + 1);
    } while (!Tail.empty());
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}
5848 
5849 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5850     const Loop &L, ElementCount VF) const {
5851   // Cross iteration phis such as reductions need special handling and are
5852   // currently unsupported.
5853   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5854         return Legal->isFirstOrderRecurrence(&Phi) ||
5855                Legal->isReductionVariable(&Phi);
5856       }))
5857     return false;
5858 
5859   // Phis with uses outside of the loop require special handling and are
5860   // currently unsupported.
5861   for (auto &Entry : Legal->getInductionVars()) {
5862     // Look for uses of the value of the induction at the last iteration.
5863     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5864     for (User *U : PostInc->users())
5865       if (!L.contains(cast<Instruction>(U)))
5866         return false;
5867     // Look for uses of penultimate value of the induction.
5868     for (User *U : Entry.first->users())
5869       if (!L.contains(cast<Instruction>(U)))
5870         return false;
5871   }
5872 
5873   // Induction variables that are widened require special handling that is
5874   // currently not supported.
5875   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5876         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5877                  this->isProfitableToScalarize(Entry.first, VF));
5878       }))
5879     return false;
5880 
5881   // Epilogue vectorization code has not been auditted to ensure it handles
5882   // non-latch exits properly.  It may be fine, but it needs auditted and
5883   // tested.
5884   if (L.getExitingBlock() != L.getLoopLatch())
5885     return false;
5886 
5887   return true;
5888 }
5889 
5890 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5891     const ElementCount VF) const {
5892   // FIXME: We need a much better cost-model to take different parameters such
5893   // as register pressure, code size increase and cost of extra branches into
5894   // account. For now we apply a very crude heuristic and only consider loops
5895   // with vectorization factors larger than a certain value.
5896   // We also consider epilogue vectorization unprofitable for targets that don't
5897   // consider interleaving beneficial (eg. MVE).
5898   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5899     return false;
5900   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5901     return true;
5902   return false;
5903 }
5904 
// Select a VF for the vectorized epilogue loop, given the main loop's VF.
// Returns VectorizationFactor::Disabled() when epilogue vectorization is
// switched off, unsupported for this loop, or not profitable.
VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  // A forced factor short-circuits the profitability heuristics, but must
  // still correspond to a VPlan built by the planner.
  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
              << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
            << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  // Scalable main-loop VFs are approximated by their known minimum number of
  // lanes for the purpose of choosing an epilogue VF.
  auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
  if (MainLoopVF.isScalable())
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
                  "yet supported. Converting to fixed-width (VF="
               << FixedMainLoopVF << ") instead\n");

  if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // Choose the most profitable previously-recorded candidate that is strictly
  // narrower than the main loop's VF and has a VPlan available.
  for (auto &NextVF : ProfitableVFs)
    if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
        (Result.Width.getFixedValue() == 1 ||
         isMoreProfitable(NextVF, Result)) &&
        LVP.hasPlanWithVF(NextVF.Width))
      Result = NextVF;

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width.getFixedValue() << "\n";);
  return Result;
}
5976 
5977 std::pair<unsigned, unsigned>
5978 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5979   unsigned MinWidth = -1U;
5980   unsigned MaxWidth = 8;
5981   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5982   for (Type *T : ElementTypesInLoop) {
5983     MinWidth = std::min<unsigned>(
5984         MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5985     MaxWidth = std::max<unsigned>(
5986         MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5987   }
5988   return {MinWidth, MaxWidth};
5989 }
5990 
5991 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5992   ElementTypesInLoop.clear();
5993   // For each block.
5994   for (BasicBlock *BB : TheLoop->blocks()) {
5995     // For each instruction in the loop.
5996     for (Instruction &I : BB->instructionsWithoutDebug()) {
5997       Type *T = I.getType();
5998 
5999       // Skip ignored values.
6000       if (ValuesToIgnore.count(&I))
6001         continue;
6002 
6003       // Only examine Loads, Stores and PHINodes.
6004       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6005         continue;
6006 
6007       // Examine PHI nodes that are reduction variables. Update the type to
6008       // account for the recurrence type.
6009       if (auto *PN = dyn_cast<PHINode>(&I)) {
6010         if (!Legal->isReductionVariable(PN))
6011           continue;
6012         const RecurrenceDescriptor &RdxDesc =
6013             Legal->getReductionVars().find(PN)->second;
6014         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6015             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6016                                       RdxDesc.getRecurrenceType(),
6017                                       TargetTransformInfo::ReductionFlags()))
6018           continue;
6019         T = RdxDesc.getRecurrenceType();
6020       }
6021 
6022       // Examine the stored values.
6023       if (auto *ST = dyn_cast<StoreInst>(&I))
6024         T = ST->getValueOperand()->getType();
6025 
6026       // Ignore loaded pointer types and stored pointer types that are not
6027       // vectorizable.
6028       //
6029       // FIXME: The check here attempts to predict whether a load or store will
6030       //        be vectorized. We only know this for certain after a VF has
6031       //        been selected. Here, we assume that if an access can be
6032       //        vectorized, it will be. We should also look at extending this
6033       //        optimization to non-pointer types.
6034       //
6035       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6036           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6037         continue;
6038 
6039       ElementTypesInLoop.insert(T);
6040     }
6041   }
6042 }
6043 
/// Select the interleave (unroll) count to use when vectorizing with factor
/// \p VF. \p LoopCost is the previously computed cost of one loop iteration
/// at \p VF, or 0 if it has not been computed yet.
unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // Interleaving relies on a scalar epilogue to handle the remainder
  // iterations; do not interleave when one is not allowed.
  if (!isScalarEpilogueAllowed())
    return 1;

  // If there is a finite maximum safe dependence distance, it has already
  // been used to constrain the vectorization factor; do not interleave on
  // top of that.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions(HasReductions && VF = 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  // Estimate the register pressure for this single VF.
  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto& pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  for (auto& pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first) << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    // The final IC is limited by the register class that allows the fewest
    // parallel instances.
    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount =
      TTI.getMaxInterleaveFactor(VF.getKnownMinValue());

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If trip count is known or estimated compile time constant, limit the
  // interleave count to be less than the trip count divided by VF, provided it
  // is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second vector
  // iterations is enabled. However, for larger loops, there is likely to be a
  // similar benefit as for fixed-width vectors. For now, we choose to leave
  // the InterleaveCount as if vscale is '1', although if some information about
  // the vector is known (e.g. min vector size), we can make a better decision.
  if (BestKnownTC) {
    MaxInterleaveCount =
        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
    // Make sure MaxInterleaveCount is greater than 0.
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    InstructionCost C = expectedCost(VF).first;
    assert(C.isValid() && "Expected to have chosen a VF with valid cost");
    LoopCost = *C.getValue();
  }

  assert(LoopCost && "Non-zero loop cost expected");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(HasReductions);
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
              RdxDesc.getRecurrenceKind());
        });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
6282 
/// Estimate, for each candidate VF in \p VFs, the maximum number of live
/// registers per register class (plus loop-invariant usage) via a linear
/// scan over instruction live intervals.
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points. Note that &I was just pushed, so
        // IdxToInstr.size() is one past the index of this user; the operand
        // therefore stays "open" while its last user is being counted below.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  // Per-VF map from register class ID to the maximum simultaneous usage seen.
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  const auto &TTICapture = TTI;
  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
      return 0;
    InstructionCost::CostType RegUsage =
        *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
    assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
           "Nonsensical values for register usage.");
    return RegUsage;
  };

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I))
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of live intervals.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j].isScalar()) {
        // Scalar VF: every open value occupies one scalar register.
        for (auto Inst : OpenIntervals) {
          unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
          if (RegUsage.find(ClassID) == RegUsage.end())
            RegUsage[ClassID] = 1;
          else
            RegUsage[ClassID] += 1;
        }
      } else {
        // Vector VF: needed so isScalarAfterVectorization below is valid.
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = 1;
            else
              RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
            else
              RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      // Fold this location's usage into the per-class running maximum.
      for (auto& pair : RegUsage) {
        if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
        else
          MaxUsages[j][pair.first] = pair.second;
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    // Accumulate register usage of loop-invariant values per register class;
    // invariants are materialized once regardless of interleaving.
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}
6478 
6479 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
6480   // TODO: Cost model for emulated masked load/store is completely
6481   // broken. This hack guides the cost model to use an artificially
6482   // high enough value to practically disable vectorization with such
6483   // operations, except where previously deployed legality hack allowed
6484   // using very low cost values. This is to avoid regressions coming simply
6485   // from moving "masked load/store" check from legality to cost model.
6486   // Masked Load/Gather emulation was previously never allowed.
6487   // Limited number of Masked Store/Scatter emulation was allowed.
6488   assert(isPredicatedInst(I) &&
6489          "Expecting a scalar emulated instruction");
6490   return isa<LoadInst>(I) ||
6491          (isa<StoreInst>(I) &&
6492           NumPredStores > NumberOfStoresToPredicate);
6493 }
6494 
6495 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6496   // If we aren't vectorizing the loop, or if we've already collected the
6497   // instructions to scalarize, there's nothing to do. Collection may already
6498   // have occurred if we have a user-selected VF and are now computing the
6499   // expected cost for interleaving.
6500   if (VF.isScalar() || VF.isZero() ||
6501       InstsToScalarize.find(VF) != InstsToScalarize.end())
6502     return;
6503 
6504   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
6505   // not profitable to scalarize any instructions, the presence of VF in the
6506   // map will indicate that we've analyzed it already.
6507   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6508 
6509   // Find all the instructions that are scalar with predication in the loop and
6510   // determine if it would be better to not if-convert the blocks they are in.
6511   // If so, we also record the instructions to scalarize.
6512   for (BasicBlock *BB : TheLoop->blocks()) {
6513     if (!blockNeedsPredicationForAnyReason(BB))
6514       continue;
6515     for (Instruction &I : *BB)
6516       if (isScalarWithPredication(&I)) {
6517         ScalarCostsTy ScalarCosts;
6518         // Do not apply discount if scalable, because that would lead to
6519         // invalid scalarization costs.
6520         // Do not apply discount logic if hacked cost is needed
6521         // for emulated masked memrefs.
6522         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
6523             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6524           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6525         // Remember that BB will remain after vectorization.
6526         PredicatedBBsAfterVectorization.insert(BB);
6527       }
6528   }
6529 }
6530 
/// Compute the cost discount obtained by scalarizing \p PredInst together
/// with the single-use expression chain feeding it, at factor \p VF. The
/// per-instruction scalar costs of the chain are recorded in \p ScalarCosts.
/// A non-negative result means scalarizing is no worse than vectorizing.
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnes(VF.getFixedValue()), true, false);
      // One phi per scalarized lane to merge the predicated result.
      ScalarCost +=
          VF.getFixedValue() *
          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnes(VF.getFixedValue()), false, true);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  // The accumulated discount must be a valid cost here; hand it back as a
  // plain integer for the caller's comparison against zero.
  return *Discount.getValue();
}
6642 
6643 LoopVectorizationCostModel::VectorizationCostTy
6644 LoopVectorizationCostModel::expectedCost(
6645     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6646   VectorizationCostTy Cost;
6647 
6648   // For each block.
6649   for (BasicBlock *BB : TheLoop->blocks()) {
6650     VectorizationCostTy BlockCost;
6651 
6652     // For each instruction in the old loop.
6653     for (Instruction &I : BB->instructionsWithoutDebug()) {
6654       // Skip ignored values.
6655       if (ValuesToIgnore.count(&I) ||
6656           (VF.isVector() && VecValuesToIgnore.count(&I)))
6657         continue;
6658 
6659       VectorizationCostTy C = getInstructionCost(&I, VF);
6660 
6661       // Check if we should override the cost.
6662       if (C.first.isValid() &&
6663           ForceTargetInstructionCost.getNumOccurrences() > 0)
6664         C.first = InstructionCost(ForceTargetInstructionCost);
6665 
6666       // Keep a list of instructions with invalid costs.
6667       if (Invalid && !C.first.isValid())
6668         Invalid->emplace_back(&I, VF);
6669 
6670       BlockCost.first += C.first;
6671       BlockCost.second |= C.second;
6672       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6673                         << " for VF " << VF << " For instruction: " << I
6674                         << '\n');
6675     }
6676 
6677     // If we are vectorizing a predicated block, it will have been
6678     // if-converted. This means that the block's instructions (aside from
6679     // stores and instructions that may divide by zero) will now be
6680     // unconditionally executed. For the scalar case, we may not always execute
6681     // the predicated block, if it is an if-else block. Thus, scale the block's
6682     // cost by the probability of executing it. blockNeedsPredication from
6683     // Legal is used so as to not include all blocks in tail folded loops.
6684     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6685       BlockCost.first /= getReciprocalPredBlockProb();
6686 
6687     Cost.first += BlockCost.first;
6688     Cost.second |= BlockCost.second;
6689   }
6690 
6691   return Cost;
6692 }
6693 
6694 /// Gets Address Access SCEV after verifying that the access pattern
6695 /// is loop invariant except the induction variable dependence.
6696 ///
6697 /// This SCEV can be sent to the Target in order to estimate the address
6698 /// calculation cost.
6699 static const SCEV *getAddressAccessSCEV(
6700               Value *Ptr,
6701               LoopVectorizationLegality *Legal,
6702               PredicatedScalarEvolution &PSE,
6703               const Loop *TheLoop) {
6704 
6705   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6706   if (!Gep)
6707     return nullptr;
6708 
6709   // We are looking for a gep with all loop invariant indices except for one
6710   // which should be an induction variable.
6711   auto SE = PSE.getSE();
6712   unsigned NumOperands = Gep->getNumOperands();
6713   for (unsigned i = 1; i < NumOperands; ++i) {
6714     Value *Opd = Gep->getOperand(i);
6715     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6716         !Legal->isInductionVariable(Opd))
6717       return nullptr;
6718   }
6719 
6720   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6721   return PSE.getSCEV(Ptr);
6722 }
6723 
6724 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6725   return Legal->hasStride(I->getOperand(0)) ||
6726          Legal->hasStride(I->getOperand(1));
6727 }
6728 
/// Compute the cost of scalarizing the memory instruction \p I for vector
/// factor \p VF: VF scalar accesses plus per-lane address computation and
/// the insert/extract overhead of moving values between vector and scalar
/// form. Predicated accesses additionally pay for i1 extracts and branches.
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  // Scalarization emits one scalar access per lane, which requires a known
  // (fixed) lane count; a scalable VF therefore gets an invalid cost.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  //       that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  InstructionCost Cost =
      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const Align Alignment = getLoadStoreAlignment(I);
  Cost += VF.getKnownMinValue() *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                              AS, TTI::TCK_RecipThroughput);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    // Add the cost of an i1 extract and a branch
    auto *Vec_i1Ty =
        VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
    Cost += TTI.getScalarizationOverhead(
        Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
        /*Insert=*/false, /*Extract=*/true);
    Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
6787 
6788 InstructionCost
6789 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6790                                                     ElementCount VF) {
6791   Type *ValTy = getLoadStoreType(I);
6792   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6793   Value *Ptr = getLoadStorePointerOperand(I);
6794   unsigned AS = getLoadStoreAddressSpace(I);
6795   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6796   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6797 
6798   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6799          "Stride should be 1 or -1 for consecutive memory access");
6800   const Align Alignment = getLoadStoreAlignment(I);
6801   InstructionCost Cost = 0;
6802   if (Legal->isMaskRequired(I))
6803     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6804                                       CostKind);
6805   else
6806     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6807                                 CostKind, I);
6808 
6809   bool Reverse = ConsecutiveStride < 0;
6810   if (Reverse)
6811     Cost +=
6812         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6813   return Cost;
6814 }
6815 
6816 InstructionCost
6817 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6818                                                 ElementCount VF) {
6819   assert(Legal->isUniformMemOp(*I));
6820 
6821   Type *ValTy = getLoadStoreType(I);
6822   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6823   const Align Alignment = getLoadStoreAlignment(I);
6824   unsigned AS = getLoadStoreAddressSpace(I);
6825   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6826   if (isa<LoadInst>(I)) {
6827     return TTI.getAddressComputationCost(ValTy) +
6828            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6829                                CostKind) +
6830            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6831   }
6832   StoreInst *SI = cast<StoreInst>(I);
6833 
6834   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6835   return TTI.getAddressComputationCost(ValTy) +
6836          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6837                              CostKind) +
6838          (isLoopInvariantStoreValue
6839               ? 0
6840               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6841                                        VF.getKnownMinValue() - 1));
6842 }
6843 
6844 InstructionCost
6845 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6846                                                  ElementCount VF) {
6847   Type *ValTy = getLoadStoreType(I);
6848   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6849   const Align Alignment = getLoadStoreAlignment(I);
6850   const Value *Ptr = getLoadStorePointerOperand(I);
6851 
6852   return TTI.getAddressComputationCost(VectorTy) +
6853          TTI.getGatherScatterOpCost(
6854              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6855              TargetTransformInfo::TCK_RecipThroughput, I);
6856 }
6857 
/// Compute the cost of the whole interleave group that \p I belongs to, for
/// factor \p VF: a wide memory operation over VF * InterleaveFactor elements
/// plus, for reversed groups, one reverse shuffle per member.
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  // The wide vector covers all members of the group across VF iterations.
  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group. A mask for gaps is
  // needed when the group has missing members that must not be accessed
  // (a store with gaps, or a load whose tail cannot run in a scalar
  // epilogue).
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}
6900 
/// Try to cost the in-loop reduction pattern that \p I participates in as a
/// single combined target operation (e.g. an extending multiply-accumulate
/// reduction). Returns None when \p I is not part of such a pattern, 0 for
/// pattern members whose cost is folded into the root, and the combined cost
/// for the root instruction itself.
Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // should be used.
  Instruction *RetI = I;
  // Step over a single-user extend so we cost from the root of the chain.
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  // Likewise step over a multiply whose single user is the reduction add.
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
  Instruction *ReductionPhi = LastChain;
  // Walk up the immediate chain until we reach the reduction phi.
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];

  const RecurrenceDescriptor &RdxDesc =
      Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;

  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(1) == LastChain
                           ? dyn_cast<Instruction>(RetI->getOperand(0))
                           : dyn_cast<Instruction>(RetI->getOperand(1));

  // Re-derive the vector type from the element type of RetI's first operand,
  // keeping the element count, so the pattern is costed at the input width.
  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp &&
      match(RedOp,
            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Op0);
    auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
    auto *MulType = VectorType::get(Op0->getType(), VectorTy);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
                             TTI::CastContextHint::None, CostKind, Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
                             TTI::CastContextHint::None, CostKind, RedOp);

    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    // Prefer the fused reduction only when it beats the sum of the parts.
    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
             !TheLoop->isLoopInvariant(RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(RedOp);
    auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
                             TTI::CastContextHint::None, CostKind, RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp &&
             match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
    if (match(Op0, m_ZExtOrSExt(m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Op0);
      Type *Op0Ty = Op0->getOperand(0)->getType();
      Type *Op1Ty = Op1->getOperand(0)->getType();
      Type *LargestOpTy =
          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
                                                                    : Op0Ty;
      auto *ExtType = VectorType::get(LargestOpTy, VectorTy);

      // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
      InstructionCost ExtCost0 = TTI.getCastInstrCost(
          Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
          TTI::CastContextHint::None, CostKind, Op0);
      InstructionCost ExtCost1 = TTI.getCastInstrCost(
          Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
          TTI::CastContextHint::None, CostKind, Op1);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
          CostKind);
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        ExtraExtCost = TTI.getCastInstrCost(
            ExtraExtOp->getOpcode(), ExtType,
            VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
            TTI::CastContextHint::None, CostKind, ExtraExtOp);
      }

      if (RedCost.isValid() &&
          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
        return I == RetI ? RedCost : 0;
    } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
      // Matched reduce(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
          CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  // No pattern beat the component costs: only the root reports the base
  // reduction cost; everyone else falls back to the default cost model.
  return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
}
7076 
7077 InstructionCost
7078 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7079                                                      ElementCount VF) {
7080   // Calculate scalar cost only. Vectorization cost should be ready at this
7081   // moment.
7082   if (VF.isScalar()) {
7083     Type *ValTy = getLoadStoreType(I);
7084     const Align Alignment = getLoadStoreAlignment(I);
7085     unsigned AS = getLoadStoreAddressSpace(I);
7086 
7087     return TTI.getAddressComputationCost(ValTy) +
7088            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7089                                TTI::TCK_RecipThroughput, I);
7090   }
7091   return getWideningCost(I, VF);
7092 }
7093 
/// Return the cost of instruction \p I at factor \p VF together with a flag
/// indicating whether the result type will be kept in vector form (i.e. not
/// split/scalarized into more pieces than VF by the target).
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(I))
      // Pay the scalar cost once per lane.
      return VectorizationCostTy(
          (getInstructionCost(I, ElementCount::getFixed(1)).first *
           VF.getKnownMinValue()),
          false);
  }

  Type *VectorTy;
  InstructionCost C = getInstructionCost(I, VF, VectorTy);

  // The type is "not scalarized" when the target splits it into fewer legal
  // parts than there are lanes. Zero parts means the target cannot handle
  // the type at all, so the cost is invalid.
  bool TypeNotScalarized = false;
  if (VF.isVector() && VectorTy->isVectorTy()) {
    unsigned NumParts = TTI.getNumberOfParts(VectorTy);
    if (NumParts)
      TypeNotScalarized = NumParts < VF.getKnownMinValue();
    else
      C = InstructionCost::getInvalid();
  }
  return VectorizationCostTy(C, TypeNotScalarized);
}
7129 
/// Estimate the insert/extract overhead incurred when scalarizing \p I for
/// factor \p VF: inserting its scalar results into a vector and extracting
/// its operands from vectors, subject to target preferences.
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                     ElementCount VF) const {

  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  // A scalar instruction needs no inserts or extracts.
  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  // Cost of inserting the VF scalar results into the vector result, unless
  // the target can load elements into vectors efficiently.
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(
        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
        false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->args() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), Tys);
}
7170 
/// For every memory instruction in the loop, pick and record the cheapest
/// legal widening strategy for \p VF (widen, widen-reversed, interleave,
/// gather/scatter or scalarize), then force address computations scalar on
/// targets that prefer scalar addressing.
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Only memory instructions (with a pointer operand) are considered.
      Value *Ptr =  getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniformMemOp(I)) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        InstructionCost Cost;
        if (isa<StoreInst>(&I) && VF.isScalable() &&
            isLegalGatherOrScatter(&I)) {
          // A scalable uniform store cannot be scalarized; fall back to a
          // scatter when the target supports it.
          Cost = getGatherScatterCost(&I, VF);
          setWideningDecision(&I, VF, CM_GatherScatter, Cost);
        } else {
          assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
                 "Cannot yet scalarize uniform stores");
          Cost = getUniformMemOpCost(&I, VF);
          setWideningDecision(&I, VF, CM_Scalarize, Cost);
        }
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      // Gather/scatter and scalarization costs are per member, so scale by
      // the number of accesses in the (possible) group.
      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  // Force everything feeding an address to stay scalar.
  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
7330 
7331 InstructionCost
7332 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7333                                                Type *&VectorTy) {
7334   Type *RetTy = I->getType();
7335   if (canTruncateToMinimalBitwidth(I, VF))
7336     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7337   auto SE = PSE.getSE();
7338   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7339 
7340   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7341                                                 ElementCount VF) -> bool {
7342     if (VF.isScalar())
7343       return true;
7344 
7345     auto Scalarized = InstsToScalarize.find(VF);
7346     assert(Scalarized != InstsToScalarize.end() &&
7347            "VF not yet analyzed for scalarization profitability");
7348     return !Scalarized->second.count(I) &&
7349            llvm::all_of(I->users(), [&](User *U) {
7350              auto *UI = cast<Instruction>(U);
7351              return !Scalarized->second.count(UI);
7352            });
7353   };
7354   (void) hasSingleCopyAfterVectorization;
7355 
7356   if (isScalarAfterVectorization(I, VF)) {
7357     // With the exception of GEPs and PHIs, after scalarization there should
7358     // only be one copy of the instruction generated in the loop. This is
7359     // because the VF is either 1, or any instructions that need scalarizing
7360     // have already been dealt with by the the time we get here. As a result,
7361     // it means we don't have to multiply the instruction cost by VF.
7362     assert(I->getOpcode() == Instruction::GetElementPtr ||
7363            I->getOpcode() == Instruction::PHI ||
7364            (I->getOpcode() == Instruction::BitCast &&
7365             I->getType()->isPointerTy()) ||
7366            hasSingleCopyAfterVectorization(I, VF));
7367     VectorTy = RetTy;
7368   } else
7369     VectorTy = ToVectorTy(RetTy, VF);
7370 
7371   // TODO: We need to estimate the cost of intrinsic calls.
7372   switch (I->getOpcode()) {
7373   case Instruction::GetElementPtr:
7374     // We mark this instruction as zero-cost because the cost of GEPs in
7375     // vectorized code depends on whether the corresponding memory instruction
7376     // is scalarized or not. Therefore, we handle GEPs with the memory
7377     // instruction cost.
7378     return 0;
7379   case Instruction::Br: {
7380     // In cases of scalarized and predicated instructions, there will be VF
7381     // predicated blocks in the vectorized loop. Each branch around these
7382     // blocks requires also an extract of its vector compare i1 element.
7383     bool ScalarPredicatedBB = false;
7384     BranchInst *BI = cast<BranchInst>(I);
7385     if (VF.isVector() && BI->isConditional() &&
7386         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7387          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7388       ScalarPredicatedBB = true;
7389 
7390     if (ScalarPredicatedBB) {
7391       // Not possible to scalarize scalable vector with predicated instructions.
7392       if (VF.isScalable())
7393         return InstructionCost::getInvalid();
7394       // Return cost for branches around scalarized and predicated blocks.
7395       auto *Vec_i1Ty =
7396           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7397       return (
7398           TTI.getScalarizationOverhead(
7399               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7400           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7401     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7402       // The back-edge branch will remain, as will all scalar branches.
7403       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7404     else
7405       // This branch will be eliminated by if-conversion.
7406       return 0;
7407     // Note: We currently assume zero cost for an unconditional branch inside
7408     // a predicated block since it will become a fall-through, although we
7409     // may decide in the future to call TTI for all branches.
7410   }
7411   case Instruction::PHI: {
7412     auto *Phi = cast<PHINode>(I);
7413 
7414     // First-order recurrences are replaced by vector shuffles inside the loop.
7415     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7416     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7417       return TTI.getShuffleCost(
7418           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7419           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7420 
7421     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7422     // converted into select instructions. We require N - 1 selects per phi
7423     // node, where N is the number of incoming values.
7424     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7425       return (Phi->getNumIncomingValues() - 1) *
7426              TTI.getCmpSelInstrCost(
7427                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7428                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7429                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7430 
7431     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7432   }
7433   case Instruction::UDiv:
7434   case Instruction::SDiv:
7435   case Instruction::URem:
7436   case Instruction::SRem:
7437     // If we have a predicated instruction, it may not be executed for each
7438     // vector lane. Get the scalarization cost and scale this amount by the
7439     // probability of executing the predicated block. If the instruction is not
7440     // predicated, we fall through to the next case.
7441     if (VF.isVector() && isScalarWithPredication(I)) {
7442       InstructionCost Cost = 0;
7443 
7444       // These instructions have a non-void type, so account for the phi nodes
7445       // that we will create. This cost is likely to be zero. The phi node
7446       // cost, if any, should be scaled by the block probability because it
7447       // models a copy at the end of each predicated block.
7448       Cost += VF.getKnownMinValue() *
7449               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7450 
7451       // The cost of the non-predicated instruction.
7452       Cost += VF.getKnownMinValue() *
7453               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7454 
7455       // The cost of insertelement and extractelement instructions needed for
7456       // scalarization.
7457       Cost += getScalarizationOverhead(I, VF);
7458 
7459       // Scale the cost by the probability of executing the predicated blocks.
7460       // This assumes the predicated block for each vector lane is equally
7461       // likely.
7462       return Cost / getReciprocalPredBlockProb();
7463     }
7464     LLVM_FALLTHROUGH;
7465   case Instruction::Add:
7466   case Instruction::FAdd:
7467   case Instruction::Sub:
7468   case Instruction::FSub:
7469   case Instruction::Mul:
7470   case Instruction::FMul:
7471   case Instruction::FDiv:
7472   case Instruction::FRem:
7473   case Instruction::Shl:
7474   case Instruction::LShr:
7475   case Instruction::AShr:
7476   case Instruction::And:
7477   case Instruction::Or:
7478   case Instruction::Xor: {
7479     // Since we will replace the stride by 1 the multiplication should go away.
7480     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7481       return 0;
7482 
7483     // Detect reduction patterns
7484     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7485       return *RedCost;
7486 
7487     // Certain instructions can be cheaper to vectorize if they have a constant
7488     // second vector operand. One example of this are shifts on x86.
7489     Value *Op2 = I->getOperand(1);
7490     TargetTransformInfo::OperandValueProperties Op2VP;
7491     TargetTransformInfo::OperandValueKind Op2VK =
7492         TTI.getOperandInfo(Op2, Op2VP);
7493     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7494       Op2VK = TargetTransformInfo::OK_UniformValue;
7495 
7496     SmallVector<const Value *, 4> Operands(I->operand_values());
7497     return TTI.getArithmeticInstrCost(
7498         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7499         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7500   }
7501   case Instruction::FNeg: {
7502     return TTI.getArithmeticInstrCost(
7503         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7504         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7505         TargetTransformInfo::OP_None, I->getOperand(0), I);
7506   }
7507   case Instruction::Select: {
7508     SelectInst *SI = cast<SelectInst>(I);
7509     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7510     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7511 
7512     const Value *Op0, *Op1;
7513     using namespace llvm::PatternMatch;
7514     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7515                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7516       // select x, y, false --> x & y
7517       // select x, true, y --> x | y
7518       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7519       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7520       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7521       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7522       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7523               Op1->getType()->getScalarSizeInBits() == 1);
7524 
7525       SmallVector<const Value *, 2> Operands{Op0, Op1};
7526       return TTI.getArithmeticInstrCost(
7527           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7528           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7529     }
7530 
7531     Type *CondTy = SI->getCondition()->getType();
7532     if (!ScalarCond)
7533       CondTy = VectorType::get(CondTy, VF);
7534 
7535     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7536     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7537       Pred = Cmp->getPredicate();
7538     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7539                                   CostKind, I);
7540   }
7541   case Instruction::ICmp:
7542   case Instruction::FCmp: {
7543     Type *ValTy = I->getOperand(0)->getType();
7544     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7545     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7546       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7547     VectorTy = ToVectorTy(ValTy, VF);
7548     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7549                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7550                                   I);
7551   }
7552   case Instruction::Store:
7553   case Instruction::Load: {
7554     ElementCount Width = VF;
7555     if (Width.isVector()) {
7556       InstWidening Decision = getWideningDecision(I, Width);
7557       assert(Decision != CM_Unknown &&
7558              "CM decision should be taken at this point");
7559       if (Decision == CM_Scalarize)
7560         Width = ElementCount::getFixed(1);
7561     }
7562     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7563     return getMemoryInstructionCost(I, VF);
7564   }
7565   case Instruction::BitCast:
7566     if (I->getType()->isPointerTy())
7567       return 0;
7568     LLVM_FALLTHROUGH;
7569   case Instruction::ZExt:
7570   case Instruction::SExt:
7571   case Instruction::FPToUI:
7572   case Instruction::FPToSI:
7573   case Instruction::FPExt:
7574   case Instruction::PtrToInt:
7575   case Instruction::IntToPtr:
7576   case Instruction::SIToFP:
7577   case Instruction::UIToFP:
7578   case Instruction::Trunc:
7579   case Instruction::FPTrunc: {
7580     // Computes the CastContextHint from a Load/Store instruction.
7581     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7582       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7583              "Expected a load or a store!");
7584 
7585       if (VF.isScalar() || !TheLoop->contains(I))
7586         return TTI::CastContextHint::Normal;
7587 
7588       switch (getWideningDecision(I, VF)) {
7589       case LoopVectorizationCostModel::CM_GatherScatter:
7590         return TTI::CastContextHint::GatherScatter;
7591       case LoopVectorizationCostModel::CM_Interleave:
7592         return TTI::CastContextHint::Interleave;
7593       case LoopVectorizationCostModel::CM_Scalarize:
7594       case LoopVectorizationCostModel::CM_Widen:
7595         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7596                                         : TTI::CastContextHint::Normal;
7597       case LoopVectorizationCostModel::CM_Widen_Reverse:
7598         return TTI::CastContextHint::Reversed;
7599       case LoopVectorizationCostModel::CM_Unknown:
7600         llvm_unreachable("Instr did not go through cost modelling?");
7601       }
7602 
7603       llvm_unreachable("Unhandled case!");
7604     };
7605 
7606     unsigned Opcode = I->getOpcode();
7607     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7608     // For Trunc, the context is the only user, which must be a StoreInst.
7609     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7610       if (I->hasOneUse())
7611         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7612           CCH = ComputeCCH(Store);
7613     }
7614     // For Z/Sext, the context is the operand, which must be a LoadInst.
7615     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7616              Opcode == Instruction::FPExt) {
7617       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7618         CCH = ComputeCCH(Load);
7619     }
7620 
7621     // We optimize the truncation of induction variables having constant
7622     // integer steps. The cost of these truncations is the same as the scalar
7623     // operation.
7624     if (isOptimizableIVTruncate(I, VF)) {
7625       auto *Trunc = cast<TruncInst>(I);
7626       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7627                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7628     }
7629 
7630     // Detect reduction patterns
7631     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7632       return *RedCost;
7633 
7634     Type *SrcScalarTy = I->getOperand(0)->getType();
7635     Type *SrcVecTy =
7636         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7637     if (canTruncateToMinimalBitwidth(I, VF)) {
7638       // This cast is going to be shrunk. This may remove the cast or it might
7639       // turn it into slightly different cast. For example, if MinBW == 16,
7640       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7641       //
7642       // Calculate the modified src and dest types.
7643       Type *MinVecTy = VectorTy;
7644       if (Opcode == Instruction::Trunc) {
7645         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7646         VectorTy =
7647             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7648       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7649         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7650         VectorTy =
7651             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7652       }
7653     }
7654 
7655     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7656   }
7657   case Instruction::Call: {
7658     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7659       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7660         return *RedCost;
7661     bool NeedToScalarize;
7662     CallInst *CI = cast<CallInst>(I);
7663     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7664     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7665       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7666       return std::min(CallCost, IntrinsicCost);
7667     }
7668     return CallCost;
7669   }
7670   case Instruction::ExtractValue:
7671     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7672   case Instruction::Alloca:
7673     // We cannot easily widen alloca to a scalable alloca, as
7674     // the result would need to be a vector of pointers.
7675     if (VF.isScalable())
7676       return InstructionCost::getInvalid();
7677     LLVM_FALLTHROUGH;
7678   default:
7679     // This opcode is unknown. Assume that it is the same as 'mul'.
7680     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7681   } // end of switch.
7682 }
7683 
// Legacy pass-manager identity for -loop-vectorize; the address of ID is what
// uniquely identifies the pass.
char LoopVectorize::ID = 0;

// Human-readable pass name used by the registration macros below.
static const char lv_name[] = "Loop Vectorization";

// Register the pass with the legacy pass manager, declaring every analysis it
// may query so the manager schedules them first.
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7704 
7705 namespace llvm {
7706 
7707 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7708 
7709 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7710                               bool VectorizeOnlyWhenForced) {
7711   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7712 }
7713 
7714 } // end namespace llvm
7715 
7716 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7717   // Check if the pointer operand of a load or store instruction is
7718   // consecutive.
7719   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7720     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7721   return false;
7722 }
7723 
7724 void LoopVectorizationCostModel::collectValuesToIgnore() {
7725   // Ignore ephemeral values.
7726   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7727 
7728   // Ignore type-promoting instructions we identified during reduction
7729   // detection.
7730   for (auto &Reduction : Legal->getReductionVars()) {
7731     const RecurrenceDescriptor &RedDes = Reduction.second;
7732     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7733     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7734   }
7735   // Ignore type-casting instructions we identified during induction
7736   // detection.
7737   for (auto &Induction : Legal->getInductionVars()) {
7738     const InductionDescriptor &IndDes = Induction.second;
7739     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7740     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7741   }
7742 }
7743 
7744 void LoopVectorizationCostModel::collectInLoopReductions() {
7745   for (auto &Reduction : Legal->getReductionVars()) {
7746     PHINode *Phi = Reduction.first;
7747     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7748 
7749     // We don't collect reductions that are type promoted (yet).
7750     if (RdxDesc.getRecurrenceType() != Phi->getType())
7751       continue;
7752 
7753     // If the target would prefer this reduction to happen "in-loop", then we
7754     // want to record it as such.
7755     unsigned Opcode = RdxDesc.getOpcode();
7756     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7757         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7758                                    TargetTransformInfo::ReductionFlags()))
7759       continue;
7760 
7761     // Check that we can correctly put the reductions into the loop, by
7762     // finding the chain of operations that leads from the phi to the loop
7763     // exit value.
7764     SmallVector<Instruction *, 4> ReductionOperations =
7765         RdxDesc.getReductionOpChain(Phi, TheLoop);
7766     bool InLoop = !ReductionOperations.empty();
7767     if (InLoop) {
7768       InLoopReductionChains[Phi] = ReductionOperations;
7769       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7770       Instruction *LastChain = Phi;
7771       for (auto *I : ReductionOperations) {
7772         InLoopReductionImmediateChains[I] = LastChain;
7773         LastChain = I;
7774       }
7775     }
7776     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7777                       << " reduction for phi: " << *Phi << "\n");
7778   }
7779 }
7780 
7781 // TODO: we could return a pair of values that specify the max VF and
7782 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7783 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7784 // doesn't have a cost model that can choose which plan to execute if
7785 // more than one is generated.
7786 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7787                                  LoopVectorizationCostModel &CM) {
7788   unsigned WidestType;
7789   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7790   return WidestVectorRegBits / WidestType;
7791 }
7792 
7793 VectorizationFactor
7794 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7795   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7796   ElementCount VF = UserVF;
7797   // Outer loop handling: They may require CFG and instruction level
7798   // transformations before even evaluating whether vectorization is profitable.
7799   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7800   // the vectorization pipeline.
7801   if (!OrigLoop->isInnermost()) {
7802     // If the user doesn't provide a vectorization factor, determine a
7803     // reasonable one.
7804     if (UserVF.isZero()) {
7805       VF = ElementCount::getFixed(determineVPlanVF(
7806           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7807               .getFixedSize(),
7808           CM));
7809       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7810 
7811       // Make sure we have a VF > 1 for stress testing.
7812       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7813         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7814                           << "overriding computed VF.\n");
7815         VF = ElementCount::getFixed(4);
7816       }
7817     }
7818     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7819     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7820            "VF needs to be a power of two");
7821     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7822                       << "VF " << VF << " to build VPlans.\n");
7823     buildVPlans(VF, VF);
7824 
7825     // For VPlan build stress testing, we bail out after VPlan construction.
7826     if (VPlanBuildStressTest)
7827       return VectorizationFactor::Disabled();
7828 
7829     return {VF, 0 /*Cost*/};
7830   }
7831 
7832   LLVM_DEBUG(
7833       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7834                 "VPlan-native path.\n");
7835   return VectorizationFactor::Disabled();
7836 }
7837 
/// Plan vectorization of the loop: compute the maximum feasible fixed and
/// scalable VFs, build VPlans for all candidate VFs, and select the most
/// profitable factor. \p UserVF and \p UserIC are the user-forced
/// vectorization factor and interleave count (zero when unspecified).
/// Returns None when the loop must not be vectorized nor interleaved.
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  // A user-provided VF is honored as long as it does not exceed the computed
  // maximum for its kind (fixed vs. scalable).
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (CM.selectUserVectorizationFactor(UserVF)) {
      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
      CM.collectInLoopReductions();
      buildVPlansWithVPRecipes(UserVF, UserVF);
      LLVM_DEBUG(printPlans(dbgs()));
      // The user's choice is taken as-is; no cost comparison is performed.
      return {{UserVF, 0}};
    } else
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
  }

  // Populate the set of Vectorization Factor Candidates: every power of two
  // up to each of the fixed and scalable maxima.
  ElementCountSet VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.insert(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.insert(VF);

  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  CM.collectInLoopReductions();
  // Build plans covering the whole candidate range for each kind.
  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);

  // Check if it is profitable to vectorize with runtime checks: too many
  // runtime pointer-overlap checks can outweigh the benefit of vectorizing.
  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
    bool PragmaThresholdReached =
        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
    bool ThresholdReached =
        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
                   OrigLoop->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return VectorizationFactor::Disabled();
    }
  }
  return SelectedVF;
}
7931 
7932 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7933   assert(count_if(VPlans,
7934                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7935              1 &&
7936          "Best VF has not a single VPlan.");
7937 
7938   for (const VPlanPtr &Plan : VPlans) {
7939     if (Plan->hasVF(VF))
7940       return *Plan.get();
7941   }
7942   llvm_unreachable("No plan found!");
7943 }
7944 
/// Generate IR for \p BestVPlan at the chosen VF/UF: build the loop skeleton,
/// execute the plan to widen the body, then fix up phis, live-outs and
/// analyses.
void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
                                           VPlan &BestVPlan,
                                           InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
                    << '\n');

  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  // Seed the transform state with the trip count and canonical induction
  // created by the skeleton so recipes can refer to them.
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
  State.CanonicalIV = ILV.Induction;
  // NOTE(review): presumably marks recipes whose flattened control flow could
  // yield poison (so poison-generating flags can be dropped) — confirm
  // against the callee's definition.
  ILV.collectPoisonGeneratingRecipes(State);

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  BestVPlan.execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}
7980 
7981 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7982 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7983   for (const auto &Plan : VPlans)
7984     if (PrintVPlansInDotFormat)
7985       Plan->printDOT(O);
7986     else
7987       Plan->print(O);
7988 }
7989 #endif
7990 
7991 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7992     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7993 
7994   // We create new control-flow for the vectorized loop, so the original exit
7995   // conditions will be dead after vectorization if it's only used by the
7996   // terminator
7997   SmallVector<BasicBlock*> ExitingBlocks;
7998   OrigLoop->getExitingBlocks(ExitingBlocks);
7999   for (auto *BB : ExitingBlocks) {
8000     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8001     if (!Cmp || !Cmp->hasOneUse())
8002       continue;
8003 
8004     // TODO: we should introduce a getUniqueExitingBlocks on Loop
8005     if (!DeadInstructions.insert(Cmp).second)
8006       continue;
8007 
8008     // The operands of the icmp is often a dead trunc, used by IndUpdate.
8009     // TODO: can recurse through operands in general
8010     for (Value *Op : Cmp->operands()) {
8011       if (isa<TruncInst>(Op) && Op->hasOneUse())
8012           DeadInstructions.insert(cast<Instruction>(Op));
8013     }
8014   }
8015 
8016   // We create new "steps" for induction variable updates to which the original
8017   // induction variables map. An original update instruction will be dead if
8018   // all its users except the induction variable are dead.
8019   auto *Latch = OrigLoop->getLoopLatch();
8020   for (auto &Induction : Legal->getInductionVars()) {
8021     PHINode *Ind = Induction.first;
8022     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8023 
8024     // If the tail is to be folded by masking, the primary induction variable,
8025     // if exists, isn't dead: it will be used for masking. Don't kill it.
8026     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8027       continue;
8028 
8029     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8030           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8031         }))
8032       DeadInstructions.insert(IndUpdate);
8033   }
8034 }
8035 
// The unroller works on scalars (VF = 1), so reversing a "vector" is a no-op.
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8037 
// With a single lane there is nothing to broadcast; return the scalar as-is.
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8039 
8040 Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
8041                                         Value *Step,
8042                                         Instruction::BinaryOps BinOp) {
8043   // When unrolling and the VF is 1, we only need to add a simple scalar.
8044   Type *Ty = Val->getType();
8045   assert(!Ty->isVectorTy() && "Val must be a scalar");
8046 
8047   if (Ty->isFloatingPointTy()) {
8048     // Floating-point operations inherit FMF via the builder's flags.
8049     Value *MulOp = Builder.CreateFMul(StartIdx, Step);
8050     return Builder.CreateBinOp(BinOp, Val, MulOp);
8051   }
8052   return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
8053 }
8054 
8055 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8056   SmallVector<Metadata *, 4> MDs;
8057   // Reserve first location for self reference to the LoopID metadata node.
8058   MDs.push_back(nullptr);
8059   bool IsUnrollMetadata = false;
8060   MDNode *LoopID = L->getLoopID();
8061   if (LoopID) {
8062     // First find existing loop unrolling disable metadata.
8063     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8064       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8065       if (MD) {
8066         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8067         IsUnrollMetadata =
8068             S && S->getString().startswith("llvm.loop.unroll.disable");
8069       }
8070       MDs.push_back(LoopID->getOperand(i));
8071     }
8072   }
8073 
8074   if (!IsUnrollMetadata) {
8075     // Add runtime unroll disable metadata.
8076     LLVMContext &Context = L->getHeader()->getContext();
8077     SmallVector<Metadata *, 1> DisableOperands;
8078     DisableOperands.push_back(
8079         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8080     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8081     MDs.push_back(DisableNode);
8082     MDNode *NewLoopID = MDNode::get(Context, MDs);
8083     // Set operand 0 to refer to the loop id itself.
8084     NewLoopID->replaceOperandWith(0, NewLoopID);
8085     L->setLoopID(NewLoopID);
8086   }
8087 }
8088 
8089 //===--------------------------------------------------------------------===//
8090 // EpilogueVectorizerMainLoop
8091 //===--------------------------------------------------------------------===//
8092 
8093 /// This function is partially responsible for generating the control flow
8094 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
  // Remember the original loop-ID metadata; it is handed to
  // completeLoopSkeleton below when the skeleton is finalized.
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("");

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  // Materialize the step (VF * UF) in the preheader, and create the canonical
  // induction counting from 0 up to the rounded-down vector trip count. The
  // vector trip count is saved in EPI so the second (epilogue) pass can reuse
  // it.
  IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
  Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  EPI.VectorTripCount = CountRoundDown;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Skip induction resume value creation here because they will be created in
  // the second pass. If we created them here, they wouldn't be used anyway,
  // because the vplan in the second pass still contains the inductions from the
  // original loop.

  return completeLoopSkeleton(Lp, OrigLoopID);
}
8143 
8144 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8145   LLVM_DEBUG({
8146     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8147            << "Main Loop VF:" << EPI.MainLoopVF
8148            << ", Main Loop UF:" << EPI.MainLoopUF
8149            << ", Epilogue Loop VF:" << EPI.EpilogueVF
8150            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8151   });
8152 }
8153 
8154 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8155   DEBUG_WITH_TYPE(VerboseDebug, {
8156     dbgs() << "intermediate fn:\n"
8157            << *OrigLoop->getHeader()->getParent() << "\n";
8158   });
8159 }
8160 
8161 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8162     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8163   assert(L && "Expected valid Loop.");
8164   assert(Bypass && "Expected valid bypass basic block.");
8165   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
8166   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8167   Value *Count = getOrCreateTripCount(L);
8168   // Reuse existing vector loop preheader for TC checks.
8169   // Note that new preheader block is generated for vector loop.
8170   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8171   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8172 
8173   // Generate code to check if the loop's trip count is less than VF * UF of the
8174   // main vector loop.
8175   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8176       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8177 
8178   Value *CheckMinIters = Builder.CreateICmp(
8179       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
8180       "min.iters.check");
8181 
8182   if (!ForEpilogue)
8183     TCCheckBlock->setName("vector.main.loop.iter.check");
8184 
8185   // Create new preheader for vector loop.
8186   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8187                                    DT, LI, nullptr, "vector.ph");
8188 
8189   if (ForEpilogue) {
8190     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8191                                  DT->getNode(Bypass)->getIDom()) &&
8192            "TC check is expected to dominate Bypass");
8193 
8194     // Update dominator for Bypass & LoopExit.
8195     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8196     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8197       // For loops with multiple exits, there's no edge from the middle block
8198       // to exit blocks (as the epilogue must run) and thus no need to update
8199       // the immediate dominator of the exit blocks.
8200       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8201 
8202     LoopBypassBlocks.push_back(TCCheckBlock);
8203 
8204     // Save the trip count so we don't have to regenerate it in the
8205     // vec.epilog.iter.check. This is safe to do because the trip count
8206     // generated here dominates the vector epilog iter check.
8207     EPI.TripCount = Count;
8208   }
8209 
8210   ReplaceInstWithInst(
8211       TCCheckBlock->getTerminator(),
8212       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8213 
8214   return TCCheckBlock;
8215 }
8216 
8217 //===--------------------------------------------------------------------===//
8218 // EpilogueVectorizerEpilogueLoop
8219 //===--------------------------------------------------------------------===//
8220 
8221 /// This function is partially responsible for generating the control flow
8222 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("vec.epilog.");

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
  LoopVectorPreHeader =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, "vec.epilog.ph");
  emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
                                          VecEpilogueIterationCountCheck);

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  // Redirect the main loop's iteration-count check to branch to the epilogue
  // vector preheader instead of the (repurposed) epilogue check block.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopVectorPreHeader);

  DT->changeImmediateDominator(LoopVectorPreHeader,
                               EPI.MainLoopIterationCountCheck);

  // The first-pass bypass checks now branch straight to the scalar preheader
  // rather than into the block that became the epilogue iter-count check.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  if (EPI.SCEVSafetyCheck)
    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
  if (EPI.MemSafetyCheck)
    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  DT->changeImmediateDominator(
      VecEpilogueIterationCountCheck,
      VecEpilogueIterationCountCheck->getSinglePredecessor());

  DT->changeImmediateDominator(LoopScalarPreHeader,
                               EPI.EpilogueIterationCountCheck);
  if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks  and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock,
                                 EPI.EpilogueIterationCountCheck);

  // Keep track of bypass blocks, as they feed start values to the induction
  // phis in the scalar loop preheader.
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader. Its value is the main loop's vector trip count
  // when control arrives from the epilogue iter-count check, and 0 when it
  // arrives from the main loop's iter-count check.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable. The epilogue induction resumes from
  // EPResumeVal rather than from 0.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}
8311 
8312 BasicBlock *
8313 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8314     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8315 
8316   assert(EPI.TripCount &&
8317          "Expected trip count to have been safed in the first pass.");
8318   assert(
8319       (!isa<Instruction>(EPI.TripCount) ||
8320        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8321       "saved trip count does not dominate insertion point.");
8322   Value *TC = EPI.TripCount;
8323   IRBuilder<> Builder(Insert->getTerminator());
8324   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8325 
8326   // Generate code to check if the loop's trip count is less than VF * UF of the
8327   // vector epilogue loop.
8328   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8329       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8330 
8331   Value *CheckMinIters =
8332       Builder.CreateICmp(P, Count,
8333                          createStepForVF(Builder, Count->getType(),
8334                                          EPI.EpilogueVF, EPI.EpilogueUF),
8335                          "min.epilog.iters.check");
8336 
8337   ReplaceInstWithInst(
8338       Insert->getTerminator(),
8339       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8340 
8341   LoopBypassBlocks.push_back(Insert);
8342   return Insert;
8343 }
8344 
8345 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8346   LLVM_DEBUG({
8347     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8348            << "Epilogue Loop VF:" << EPI.EpilogueVF
8349            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8350   });
8351 }
8352 
8353 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8354   DEBUG_WITH_TYPE(VerboseDebug, {
8355     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8356   });
8357 }
8358 
8359 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8360     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8361   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8362   bool PredicateAtRangeStart = Predicate(Range.Start);
8363 
8364   for (ElementCount TmpVF = Range.Start * 2;
8365        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8366     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8367       Range.End = TmpVF;
8368       break;
8369     }
8370 
8371   return PredicateAtRangeStart;
8372 }
8373 
8374 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8375 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8376 /// of VF's starting at a given VF and extending it as much as possible. Each
8377 /// vectorization decision can potentially shorten this sub-range during
8378 /// buildVPlan().
8379 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8380                                            ElementCount MaxVF) {
8381   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8382   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8383     VFRange SubRange = {VF, MaxVFPlusOne};
8384     VPlans.push_back(buildVPlan(SubRange));
8385     VF = SubRange.End;
8386   }
8387 }
8388 
/// Compute and cache the mask for the CFG edge \p Src -> \p Dst. A nullptr
/// result means the edge mask is all-ones (no masking required).
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  // The edge mask starts from the mask of the source block.
  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  // An unconditional branch (or one with identical successors) contributes no
  // condition of its own; the edge mask is just the source block's mask.
  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask.  Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  // If Dst is reached along the false edge, the mask is the negated condition.
  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());

  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
    // The condition is 'SrcMask && EdgeMask', which is equivalent to
    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
    VPValue *False = Plan->getOrAddVPValue(
        ConstantInt::getFalse(BI->getCondition()->getType()));
    EdgeMask =
        Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
  }

  return EdgeMaskCache[Edge] = EdgeMask;
}
8433 
/// Compute and cache the block-in mask for \p BB. A nullptr result represents
/// an all-ones mask, following the convention used for masked memory recipes.
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  // The loop header gets special handling: its mask (if any) comes from the
  // tail-folding comparison rather than from predecessor edges.
  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredicationForAnyReason(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Create the block in mask as the first non-phi instruction in the block.
    VPBuilder::InsertPointGuard Guard(Builder);
    auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
    Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    // Start by constructing the desired canonical IV.
    VPValue *IV = nullptr;
    if (Legal->getPrimaryInduction())
      IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
    else {
      // No primary induction available: synthesize a widened canonical IV.
      auto *IVRecipe = new VPWidenCanonicalIVRecipe();
      Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
      IV = IVRecipe;
    }
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    bool TailFolded = !CM.isScalarEpilogueAllowed();

    if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP instructions
      // happen.
      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
    } else {
      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    }
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
  }

  return BlockMaskCache[BB] = BlockMask;
}
8497 
/// Build a widened-memory recipe for load/store \p I if the cost model decided
/// to widen it across the (clamped) VF range; return nullptr otherwise.
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                ArrayRef<VPValue *> Operands,
                                                VFRange &Range,
                                                VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  // Query the cost model's per-VF widening decision for this access.
  auto willWiden = [&](ElementCount VF) -> bool {
    if (VF.isScalar())
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  // Loads: Operands[0] is the address. Stores: Operands[1] is the address,
  // Operands[0] the stored value.
  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
                                              Consecutive, Reverse);

  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
                                            Mask, Consecutive, Reverse);
}
8543 
8544 VPWidenIntOrFpInductionRecipe *
8545 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8546                                            ArrayRef<VPValue *> Operands) const {
8547   // Check if this is an integer or fp induction. If so, build the recipe that
8548   // produces its scalar and vector values.
8549   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
8550     assert(II->getStartValue() ==
8551            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8552     return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
8553   }
8554 
8555   return nullptr;
8556 }
8557 
8558 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8559     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8560     VPlan &Plan) const {
8561   // Optimize the special case where the source is a constant integer
8562   // induction variable. Notice that we can only optimize the 'trunc' case
8563   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8564   // (c) other casts depend on pointer size.
8565 
8566   // Determine whether \p K is a truncation based on an induction variable that
8567   // can be optimized.
8568   auto isOptimizableIVTruncate =
8569       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8570     return [=](ElementCount VF) -> bool {
8571       return CM.isOptimizableIVTruncate(K, VF);
8572     };
8573   };
8574 
8575   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8576           isOptimizableIVTruncate(I), Range)) {
8577 
8578     auto *Phi = cast<PHINode>(I->getOperand(0));
8579     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8580     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8581     return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I);
8582   }
8583   return nullptr;
8584 }
8585 
8586 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8587                                                 ArrayRef<VPValue *> Operands,
8588                                                 VPlanPtr &Plan) {
8589   // If all incoming values are equal, the incoming VPValue can be used directly
8590   // instead of creating a new VPBlendRecipe.
8591   VPValue *FirstIncoming = Operands[0];
8592   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8593         return FirstIncoming == Inc;
8594       })) {
8595     return Operands[0];
8596   }
8597 
8598   // We know that all PHIs in non-header blocks are converted into selects, so
8599   // we don't have to worry about the insertion order and we can just use the
8600   // builder. At this point we generate the predication tree. There may be
8601   // duplications since this is a simple recursive scan, but future
8602   // optimizations will clean it up.
8603   SmallVector<VPValue *, 2> OperandsWithMask;
8604   unsigned NumIncoming = Phi->getNumIncomingValues();
8605 
8606   for (unsigned In = 0; In < NumIncoming; In++) {
8607     VPValue *EdgeMask =
8608       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8609     assert((EdgeMask || NumIncoming == 1) &&
8610            "Multiple predecessors with one having a full mask");
8611     OperandsWithMask.push_back(Operands[In]);
8612     if (EdgeMask)
8613       OperandsWithMask.push_back(EdgeMask);
8614   }
8615   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8616 }
8617 
8618 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8619                                                    ArrayRef<VPValue *> Operands,
8620                                                    VFRange &Range) const {
8621 
8622   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8623       [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
8624       Range);
8625 
8626   if (IsPredicated)
8627     return nullptr;
8628 
8629   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8630   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8631              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8632              ID == Intrinsic::pseudoprobe ||
8633              ID == Intrinsic::experimental_noalias_scope_decl))
8634     return nullptr;
8635 
8636   auto willWiden = [&](ElementCount VF) -> bool {
8637     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8638     // The following case may be scalarized depending on the VF.
8639     // The flag shows whether we use Intrinsic or a usual Call for vectorized
8640     // version of the instruction.
8641     // Is it beneficial to perform intrinsic call compared to lib call?
8642     bool NeedToScalarize = false;
8643     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8644     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8645     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8646     return UseVectorIntrinsic || !NeedToScalarize;
8647   };
8648 
8649   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8650     return nullptr;
8651 
8652   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8653   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8654 }
8655 
8656 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8657   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8658          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8659   // Instruction should be widened, unless it is scalar after vectorization,
8660   // scalarization is profitable or it is predicated.
8661   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8662     return CM.isScalarAfterVectorization(I, VF) ||
8663            CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
8664   };
8665   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8666                                                              Range);
8667 }
8668 
/// Build a VPWidenRecipe for \p I if its opcode is in the set of plainly
/// widenable operations; return nullptr for anything else.
VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                           ArrayRef<VPValue *> Operands) const {
  // Whitelist of opcodes that can be widened element-wise without special
  // handling (binary/unary arithmetic, compares, casts, and select).
  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return nullptr;

  // Success: widen this instruction.
  return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
}
8718 
8719 void VPRecipeBuilder::fixHeaderPhis() {
8720   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8721   for (VPWidenPHIRecipe *R : PhisToFix) {
8722     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8723     VPRecipeBase *IncR =
8724         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8725     R->addOperand(IncR->getVPSingleValue());
8726   }
8727 }
8728 
/// Build a VPReplicateRecipe for \p I (scalarizing it per lane/part). If the
/// instruction is predicated, wrap the recipe in an if-then replicate region
/// and return the new successor block; otherwise append to \p VPBB and return
/// it.
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    VPlanPtr &Plan) {
  // Uniform: only the first lane's value is needed across the clamped range.
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
      Range);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still be better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
      //   2. For the lifetime start/end intrinsics the pointer operand only
      //      does anything useful when the input comes from a stack object,
      //      which suggests it should always be uniform. For non-stack objects
      //      the effect is to poison the object, which still allows us to
      //      remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  // Subsequent recipes continue in a fresh block after the region.
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}
8806 
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects. The shape built here is a
  // triangle: entry (mask test) -> if (predicated recipe) -> continue, where
  // entry also branches directly to continue.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // Instructions producing a value need a phi in the continue block to merge
  // the value defined under the mask; void instructions do not.
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ? nullptr
                        : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
  if (PHIRecipe) {
    // Re-map Instr to the phi, so users of Instr's value see the merged
    // result rather than the predicated recipe itself.
    Plan->removeVPValueFor(Instr);
    Plan->addVPValue(Instr, PHIRecipe);
  }
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  // The trailing 'true' marks this as a replicator region; see the
  // isReplicator() checks elsewhere in this file.
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}
8839 
// Dispatch Instr to the recipe-creation helper that matches it. The result is
// either a new recipe or an existing VPValue the instruction simplifies to;
// an empty result means no widening applies and the caller falls back to
// replication.
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    // Phis outside the loop header are handled by blend recipes.
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Operands, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
      return toVPRecipeResult(Recipe);

    // Header phi that is not an optimizable induction: reduction, first-order
    // recurrence, or (currently) a pointer induction phi.
    VPWidenPHIRecipe *PhiRecipe = nullptr;
    if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
      VPValue *StartV = Operands[0];
      if (Legal->isReductionVariable(Phi)) {
        const RecurrenceDescriptor &RdxDesc =
            Legal->getReductionVars().find(Phi)->second;
        assert(RdxDesc.getRecurrenceStartValue() ==
               Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
        PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
                                             CM.isInLoopReduction(Phi),
                                             CM.useOrderedReductions(RdxDesc));
      } else {
        PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
      }

      // Record the incoming value from the backedge, so we can add the incoming
      // value from the backedge after all recipes have been created.
      recordRecipeOf(cast<Instruction>(
          Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
      PhisToFix.push_back(PhiRecipe);
    } else {
      // TODO: record start and backedge value for remaining pointer induction
      // phis.
      assert(Phi->getType()->isPointerTy() &&
             "only pointer phis should be handled here");
      PhiRecipe = new VPWidenPHIRecipe(Phi);
    }

    return toVPRecipeResult(PhiRecipe);
  }

  // A truncate of an induction may itself be computable as a (narrower)
  // induction.
  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  // Give up here if widening is rejected for this range; the caller will
  // replicate the instruction instead.
  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    // A loop-invariant condition lets the select keep a scalar condition when
    // widened (see VPWidenSelectRecipe::execute).
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  // Generic widening for the remaining supported instructions.
  return toVPRecipeResult(tryToWiden(Instr, Operands));
}
8911 
// Build one VPlan (with recipes) per sub-range of VFs in [MinVF, MaxVF],
// after preparing the dead-instruction set and the SinkAfter map.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst; // Only needed for the assertion below.
    // Walk backwards through the block until a live target is found.
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  // Build a VPlan for each sub-range of VFs; each call may shrink
  // SubRange.End, and the next sub-range starts where the previous one ended.
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}
8963 
// Build a VPlan using VPRecipes for the given range of VFs. Decisions taken
// while building may clamp Range.End (via getDecisionAndClampRange) so that
// all VFs covered by the returned plan share the same recipes.
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const MapVector<Instruction *, Instruction *> &SinkAfter) {

  // Interleave groups that apply to this plan's (possibly clamped) VF range.
  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind =
        Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    // May clamp Range.End so the interleave decision is uniform in the range.
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  };

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  auto Plan = std::make_unique<VPlan>();

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  VPBasicBlock *VPBB = nullptr;
  VPBasicBlock *HeaderVPBB = nullptr;
  SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    if (VPBB)
      VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    else {
      // First block visited: wrap the plan in a top-level region and remember
      // the header block for the phi fix-ups below.
      auto *TopRegion = new VPRegionBlock("vector loop");
      TopRegion->setEntry(FirstVPBBForBB);
      Plan->setEntry(TopRegion);
      HeaderVPBB = FirstVPBBForBB;
    }
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      // Header phis only get their start value (from the preheader) as an
      // operand here; backedge values are added after all recipes exist (see
      // fixHeaderPhis below).
      SmallVector<VPValue *, 4> Operands;
      auto *Phi = dyn_cast<PHINode>(Instr);
      if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
        Operands.push_back(Plan->getOrAddVPValue(
            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
      } else {
        auto OpRange = Plan->mapToVPValues(Instr->operands());
        Operands = {OpRange.begin(), OpRange.end()};
      }
      if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
              Instr, Operands, Range, Plan)) {
        // If Instr can be simplified to an existing VPValue, use it.
        if (RecipeOrValue.is<VPValue *>()) {
          auto *VPV = RecipeOrValue.get<VPValue *>();
          Plan->addVPValue(Instr, VPV);
          // If the re-used value is a recipe, register the recipe for the
          // instruction, in case the recipe for Instr needs to be recorded.
          if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
            RecipeBuilder.setRecipe(Instr, R);
          continue;
        }
        // Otherwise, add the new recipe.
        VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
        for (auto *Def : Recipe->definedValues()) {
          auto *UV = Def->getUnderlyingValue();
          Plan->addVPValue(UV, Def);
        }

        if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
            HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
          // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
          // of the header block. That can happen for truncates of induction
          // variables. Those recipes are moved to the phi section of the header
          // block after applying SinkAfter, which relies on the original
          // position of the trunc.
          assert(isa<TruncInst>(Instr));
          InductionsToMove.push_back(
              cast<VPWidenIntOrFpInductionRecipe>(Recipe));
        }
        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB =
          RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  assert(isa<VPRegionBlock>(Plan->getEntry()) &&
         !Plan->getEntry()->getEntryBasicBlock()->empty() &&
         "entry block must be set to a VPRegionBlock having a non-empty entry "
         "VPBasicBlock");
  // Add the backedge values recorded during recipe creation to the header phi
  // recipes.
  RecipeBuilder.fixHeaderPhis();

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  // Returns the replicate region containing R, or nullptr if R is not inside
  // one.
  auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
    auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
    if (Region && Region->isReplicator()) {
      assert(Region->getNumSuccessors() == 1 &&
             Region->getNumPredecessors() == 1 && "Expected SESE region!");
      assert(R->getParent()->size() == 1 &&
             "A recipe in an original replicator region must be the only "
             "recipe in its block");
      return Region;
    }
    return nullptr;
  };
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);

    auto *TargetRegion = GetReplicateRegion(Target);
    auto *SinkRegion = GetReplicateRegion(Sink);
    if (!SinkRegion) {
      // If the sink source is not a replicate region, sink the recipe directly.
      if (TargetRegion) {
        // The target is in a replication region, make sure to move Sink to
        // the block after it, not into the replication region itself.
        VPBasicBlock *NextBlock =
            cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
        Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
      } else
        Sink->moveAfter(Target);
      continue;
    }

    // The sink source is in a replicate region. Unhook the region from the CFG.
    auto *SinkPred = SinkRegion->getSinglePredecessor();
    auto *SinkSucc = SinkRegion->getSingleSuccessor();
    VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
    VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
    VPBlockUtils::connectBlocks(SinkPred, SinkSucc);

    if (TargetRegion) {
      // The target recipe is also in a replicate region, move the sink region
      // after the target region.
      auto *TargetSucc = TargetRegion->getSingleSuccessor();
      VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
      VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
      VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
    } else {
      // The sink source is in a replicate region, we need to move the whole
      // replicate region, which should only contain a single recipe in the
      // main block.
      auto *SplitBlock =
          Target->getParent()->splitAt(std::next(Target->getIterator()));

      auto *SplitPred = SplitBlock->getSinglePredecessor();

      VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
      VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
      VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
      // Keep VPBB tracking the last block if the split created a new one.
      if (VPBB == SplitPred)
        VPBB = SplitBlock;
    }
  }

  // The last VPBasicBlock created becomes the exit of the top-level region.
  cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);

  VPlanTransforms::removeRedundantInductionCasts(*Plan);

  // Now that sink-after is done, move induction recipes for optimized truncates
  // to the phi section of the header block.
  for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
    Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());

  // Adjust the recipes for any inloop reductions.
  adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);

  // Introduce a recipe to combine the incoming and previous values of a
  // first-order recurrence.
  for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
    auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
    if (!RecurPhi)
      continue;

    // Place the splice right after the recipe producing the previous value,
    // or after the phi section / replicate region that contains it.
    VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
    VPBasicBlock *InsertBlock = PrevRecipe->getParent();
    auto *Region = GetReplicateRegion(PrevRecipe);
    if (Region)
      InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
    if (Region || PrevRecipe->isPhi())
      Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
    else
      Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));

    auto *RecurSplice = cast<VPInstruction>(
        Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
                             {RecurPhi, RecurPhi->getBackedgeValue()}));

    RecurPhi->replaceAllUsesWith(RecurSplice);
    // Set the first operand of RecurSplice to RecurPhi again, after replacing
    // all users.
    RecurSplice->setOperand(0, RecurPhi);
  }

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  for (auto IG : InterleaveGroups) {
    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
        RecipeBuilder.getRecipe(IG->getInsertPos()));
    SmallVector<VPValue *, 4> StoredValues;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
        auto *StoreR =
            cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
        StoredValues.push_back(StoreR->getStoredValue());
      }

    auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
                                        Recipe->getMask());
    VPIG->insertBefore(Recipe);
    // Re-map each value-producing member to the corresponding result of the
    // interleave recipe, then erase the members' original recipes.
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = Plan->getVPValue(Member);
          Plan->removeVPValueFor(Member);
          Plan->addVPValue(Member, VPIG->getVPValue(J));
          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
          J++;
        }
        RecipeBuilder.getRecipe(Member)->eraseFromParent();
      }
  }

  // From this point onwards, VPlan-to-VPlan transformations may change the plan
  // in ways that accessing values using original IR values is incorrect.
  Plan->disableValue2VPValue();

  VPlanTransforms::sinkScalarOperands(*Plan);
  VPlanTransforms::mergeReplicateRegions(*Plan);

  // Register the (power-of-two) VFs the plan covers and name the plan after
  // them, for debug output.
  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}
9283 
9284 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9285   // Outer loop handling: They may require CFG and instruction level
9286   // transformations before even evaluating whether vectorization is profitable.
9287   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9288   // the vectorization pipeline.
9289   assert(!OrigLoop->isInnermost());
9290   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9291 
9292   // Create new empty VPlan
9293   auto Plan = std::make_unique<VPlan>();
9294 
9295   // Build hierarchical CFG
9296   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9297   HCFGBuilder.buildHierarchicalCFG();
9298 
9299   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9300        VF *= 2)
9301     Plan->addVF(VF);
9302 
9303   if (EnableVPlanPredication) {
9304     VPlanPredicator VPP(*Plan);
9305     VPP.predicate();
9306 
9307     // Avoid running transformation to recipes until masked code generation in
9308     // VPlan-native path is in place.
9309     return Plan;
9310   }
9311 
9312   SmallPtrSet<Instruction *, 1> DeadInstructions;
9313   VPlanTransforms::VPInstructionsToVPRecipes(
9314       OrigLoop, Plan,
9315       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9316       DeadInstructions, *PSE.getSE());
9317   return Plan;
9318 }
9319 
9320 // Adjust the recipes for reductions. For in-loop reductions the chain of
9321 // instructions leading from the loop exit instr to the phi need to be converted
9322 // to reductions, with one operand being vector and the other being the scalar
9323 // reduction chain. For other reductions, a select is introduced between the phi
9324 // and live-out recipes when folding the tail.
9325 void LoopVectorizationPlanner::adjustRecipesForReductions(
9326     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9327     ElementCount MinVF) {
9328   for (auto &Reduction : CM.getInLoopReductionChains()) {
9329     PHINode *Phi = Reduction.first;
9330     const RecurrenceDescriptor &RdxDesc =
9331         Legal->getReductionVars().find(Phi)->second;
9332     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9333 
9334     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9335       continue;
9336 
9337     // ReductionOperations are orders top-down from the phi's use to the
9338     // LoopExitValue. We keep a track of the previous item (the Chain) to tell
9339     // which of the two operands will remain scalar and which will be reduced.
9340     // For minmax the chain will be the select instructions.
9341     Instruction *Chain = Phi;
9342     for (Instruction *R : ReductionOperations) {
9343       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9344       RecurKind Kind = RdxDesc.getRecurrenceKind();
9345 
9346       VPValue *ChainOp = Plan->getVPValue(Chain);
9347       unsigned FirstOpId;
9348       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9349              "Only min/max recurrences allowed for inloop reductions");
9350       // Recognize a call to the llvm.fmuladd intrinsic.
9351       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9352       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9353              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9354       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9355         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9356                "Expected to replace a VPWidenSelectSC");
9357         FirstOpId = 1;
9358       } else {
9359         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9360                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9361                "Expected to replace a VPWidenSC");
9362         FirstOpId = 0;
9363       }
9364       unsigned VecOpId =
9365           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9366       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9367 
9368       auto *CondOp = CM.foldTailByMasking()
9369                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9370                          : nullptr;
9371 
9372       if (IsFMulAdd) {
9373         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9374         // need to create an fmul recipe to use as the vector operand for the
9375         // fadd reduction.
9376         VPInstruction *FMulRecipe = new VPInstruction(
9377             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9378         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9379         WidenRecipe->getParent()->insert(FMulRecipe,
9380                                          WidenRecipe->getIterator());
9381         VecOp = FMulRecipe;
9382       }
9383       VPReductionRecipe *RedRecipe =
9384           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9385       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9386       Plan->removeVPValueFor(R);
9387       Plan->addVPValue(R, RedRecipe);
9388       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9389       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9390       WidenRecipe->eraseFromParent();
9391 
9392       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9393         VPRecipeBase *CompareRecipe =
9394             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9395         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9396                "Expected to replace a VPWidenSC");
9397         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9398                "Expected no remaining users");
9399         CompareRecipe->eraseFromParent();
9400       }
9401       Chain = R;
9402     }
9403   }
9404 
9405   // If tail is folded by masking, introduce selects between the phi
9406   // and the live-out instruction of each reduction, at the end of the latch.
9407   if (CM.foldTailByMasking()) {
9408     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9409       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9410       if (!PhiR || PhiR->isInLoop())
9411         continue;
9412       Builder.setInsertPoint(LatchVPBB);
9413       VPValue *Cond =
9414           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9415       VPValue *Red = PhiR->getBackedgeValue();
9416       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9417     }
9418   }
9419 }
9420 
9421 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9422 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9423                                VPSlotTracker &SlotTracker) const {
9424   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9425   IG->getInsertPos()->printAsOperand(O, false);
9426   O << ", ";
9427   getAddr()->printAsOperand(O, SlotTracker);
9428   VPValue *Mask = getMask();
9429   if (Mask) {
9430     O << ", ";
9431     Mask->printAsOperand(O, SlotTracker);
9432   }
9433 
9434   unsigned OpIdx = 0;
9435   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9436     if (!IG->getMember(i))
9437       continue;
9438     if (getNumStoreOperands() > 0) {
9439       O << "\n" << Indent << "  store ";
9440       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9441       O << " to index " << i;
9442     } else {
9443       O << "\n" << Indent << "  ";
9444       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9445       O << " = load from index " << i;
9446     }
9447     ++OpIdx;
9448   }
9449 }
9450 #endif
9451 
// Delegate widening of the underlying CallInst to the InnerLoopVectorizer
// helper, passing this recipe both as the call's operand range and as the
// value the widened call defines.
void VPWidenCallRecipe::execute(VPTransformState &State) {
  State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
                                  *this, State);
}
9456 
9457 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9458   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9459   State.ILV->setDebugLocFromInst(&I);
9460 
9461   // The condition can be loop invariant  but still defined inside the
9462   // loop. This means that we can't just use the original 'cond' value.
9463   // We have to take the 'vectorized' value and pick the first lane.
9464   // Instcombine will make this a no-op.
9465   auto *InvarCond =
9466       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9467 
9468   for (unsigned Part = 0; Part < State.UF; ++Part) {
9469     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9470     Value *Op0 = State.get(getOperand(1), Part);
9471     Value *Op1 = State.get(getOperand(2), Part);
9472     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9473     State.set(this, Sel, Part);
9474     State.ILV->addMetadata(Sel, &I);
9475   }
9476 }
9477 
// Widen a "simple" instruction — unary/binary arithmetic or logic, compares,
// and casts — by emitting one wide instruction per unroll part. Opcodes with
// dedicated recipes (calls, branches, PHIs, GEPs, selects) must not reach
// this function.
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &I = *cast<Instruction>(getUnderlyingValue());
  auto &Builder = State.Builder;
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops: gather the widened operands for this part
    // and rebuild the same opcode on them.
    State.ILV->setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V)) {
        // Carry over IR flags (nuw/nsw, exact, fast-math, ...) from the
        // scalar instruction.
        VecOp->copyIRFlags(&I);

        // If the instruction is vectorized and was in a basic block that needed
        // predication, we can't propagate poison-generating flags (nuw/nsw,
        // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could make
        // the flag properties to no longer hold.
        if (State.MayGeneratePoisonRecipes.count(this) > 0)
          VecOp->dropPoisonGeneratingFlags();
      }

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.ILV->addMetadata(V, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    State.ILV->setDebugLocFromInst(Cmp);
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.ILV->addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    State.ILV->setDebugLocFromInst(CI);

    /// Vectorize casts: the destination type is the vector form of the
    /// scalar cast's type, unless VF is scalar.
    Type *DestTy = (State.VF.isScalar())
                       ? CI->getType()
                       : VectorType::get(CI->getType(), State.VF);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      State.set(this, Cast, Part);
      State.ILV->addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
9595 
// Widen a GEP: produce a vector-of-pointers GEP per unroll part (or a scalar
// GEP per part when VF is scalar), using vector-typed operands only for
// loop-varying values to keep the representation compact.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    auto *Clone = State.Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
      State.set(this, EntryPart, Part);
      State.ILV->addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // If the GEP instruction is vectorized and was in a basic block that
      // needed predication, we can't propagate the poison-generating 'inbounds'
      // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could make the 'inbounds' properties to
      // no longer hold.
      bool IsInBounds =
          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = IsInBounds
                         ? State.Builder.CreateInBoundsGEP(
                               GEP->getSourceElementType(), Ptr, Indices)
                         : State.Builder.CreateGEP(GEP->getSourceElementType(),
                                                   Ptr, Indices);
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.ILV->addMetadata(NewGEP, GEP);
    }
  }
}
9671 
// Widen an integer or floating-point induction via the InnerLoopVectorizer.
// Inductions are never replicated per lane/part, hence the assertion.
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
                                   getStartValue()->getLiveInIRValue(),
                                   getTruncInst(), getVPValue(0), State);
}
9678 
// Delegate widening of the underlying PHI node to the InnerLoopVectorizer.
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
                                 State);
}
9683 
9684 void VPBlendRecipe::execute(VPTransformState &State) {
9685   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9686   // We know that all PHIs in non-header blocks are converted into
9687   // selects, so we don't have to worry about the insertion order and we
9688   // can just use the builder.
9689   // At this point we generate the predication tree. There may be
9690   // duplications since this is a simple recursive scan, but future
9691   // optimizations will clean it up.
9692 
9693   unsigned NumIncoming = getNumIncomingValues();
9694 
9695   // Generate a sequence of selects of the form:
9696   // SELECT(Mask3, In3,
9697   //        SELECT(Mask2, In2,
9698   //               SELECT(Mask1, In1,
9699   //                      In0)))
9700   // Note that Mask0 is never used: lanes for which no path reaches this phi and
9701   // are essentially undef are taken from In0.
9702   InnerLoopVectorizer::VectorParts Entry(State.UF);
9703   for (unsigned In = 0; In < NumIncoming; ++In) {
9704     for (unsigned Part = 0; Part < State.UF; ++Part) {
9705       // We might have single edge PHIs (blocks) - use an identity
9706       // 'select' for the first PHI operand.
9707       Value *In0 = State.get(getIncomingValue(In), Part);
9708       if (In == 0)
9709         Entry[Part] = In0; // Initialize with the first incoming value.
9710       else {
9711         // Select between the current value and the previous incoming edge
9712         // based on the incoming mask.
9713         Value *Cond = State.get(getMask(In), Part);
9714         Entry[Part] =
9715             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9716       }
9717     }
9718   }
9719   for (unsigned Part = 0; Part < State.UF; ++Part)
9720     State.set(this, Entry[Part], Part);
9721 }
9722 
// Emit the wide memory accesses for an interleave group via the
// InnerLoopVectorizer. Interleave groups are never replicated per lane/part,
// hence the assertion.
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                      getStoredValues(), getMask());
}
9728 
// Generate the reduction operations for one reduction recipe: optionally mask
// inactive lanes with the recurrence identity, reduce each part's vector
// operand, and combine the result into the running chain. Ordered reductions
// thread a single chain through all parts; unordered reductions combine each
// part with its own chain operand.
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0);
  RecurKind Kind = RdxDesc->getRecurrenceKind();
  bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    // If a condition mask is present, select between the vector operand and a
    // splat of the recurrence identity so masked-off lanes do not affect the
    // reduction result.
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part);
      VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
      Value *Iden = RdxDesc->getRecurrenceIdentity(
          Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
      Value *IdenVec =
          State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      // Ordered reduction: fold this part into the chain carried from the
      // previous part. With a scalar VF there is nothing to reduce across
      // lanes, so a plain binary op suffices.
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      // Unordered reduction: reduce this part independently and combine it
      // with this part's own chain operand below.
      PrevInChain = State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
    }
    // Min/max recurrences combine via a min/max op; all others via the
    // recurrence's binary opcode (ordered reductions already folded the
    // chain in above).
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                         NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
          PrevInChain);
    State.set(this, NextInChain, Part);
  }
}
9777 
// Generate scalar instances of the underlying instruction: a single instance
// when a specific lane/part is being executed (predicated replication),
// otherwise one instance per lane of each unroll part — or lane 0 only, when
// the instruction is uniform.
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
                                    IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}
9809 
// Replace the placeholder 'unreachable' terminator of the current basic block
// with a conditional branch on the mask bit of the active lane (or on 'true'
// when no mask is attached). The branch destinations are filled in later,
// when the successor blocks are created.
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    // Extract this lane's bit if the mask is a vector.
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
9835 
// Create a phi that merges the predicated instruction's value generated in
// the predicated block with the value flowing around it from the predicating
// block. Depending on whether the operand has vector users, the phi merges
// either the packed vector value or the scalar value.
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    // Lanes that skipped the predicated block contribute poison.
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}
9881 
// Widen a load or store. Non-consecutive accesses become gathers/scatters;
// consecutive accesses become wide (possibly masked) loads/stores. Reversed
// consecutive accesses are addressed starting at the last vector element,
// with the stored value / loaded result and mask reversed accordingly.
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;

  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
  StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  Type *ScalarDataTy = getLoadStoreType(&Ingredient);

  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  bool CreateGatherScatter = !Consecutive;

  auto &Builder = State.Builder;
  InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
  bool isMaskRequired = getMask();
  if (isMaskRequired)
    for (unsigned Part = 0; Part < State.UF; ++Part)
      BlockInMaskParts[Part] = State.get(getMask(), Part);

  // Compute the address of this unroll-part's slice of the consecutive
  // access, bitcast to a pointer-to-vector type.
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    // Preserve 'inbounds' from the original address computation, if visible.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();
    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF =  VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
      PartPtr =
          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] =
            Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
    } else {
      Value *Increment =
          createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
      PartPtr->setIsInBounds(InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    State.ILV->setDebugLocFromInst(SI);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(getAddr(), Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr =
            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      State.ILV->addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  State.ILV->setDebugLocFromInst(LI);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(getAddr(), Part);
      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      State.ILV->addMetadata(NewLI, LI);
    } else {
      auto *VecPtr =
          CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
            PoisonValue::get(DataTy), "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      State.ILV->addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
    }

    State.set(getVPSingleValue(), NewLI, Part);
  }
}
10008 
10009 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10010 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10011 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10012 // for predication.
10013 static ScalarEpilogueLowering getScalarEpilogueLowering(
10014     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10015     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10016     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10017     LoopVectorizationLegality &LVL) {
10018   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10019   // don't look at hints or options, and don't request a scalar epilogue.
10020   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10021   // LoopAccessInfo (due to code dependency and not being able to reliably get
10022   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10023   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10024   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10025   // back to the old way and vectorize with versioning when forced. See D81345.)
10026   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10027                                                       PGSOQueryType::IRPass) &&
10028                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10029     return CM_ScalarEpilogueNotAllowedOptSize;
10030 
10031   // 2) If set, obey the directives
10032   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10033     switch (PreferPredicateOverEpilogue) {
10034     case PreferPredicateTy::ScalarEpilogue:
10035       return CM_ScalarEpilogueAllowed;
10036     case PreferPredicateTy::PredicateElseScalarEpilogue:
10037       return CM_ScalarEpilogueNotNeededUsePredicate;
10038     case PreferPredicateTy::PredicateOrDontVectorize:
10039       return CM_ScalarEpilogueNotAllowedUsePredicate;
10040     };
10041   }
10042 
10043   // 3) If set, obey the hints
10044   switch (Hints.getPredicate()) {
10045   case LoopVectorizeHints::FK_Enabled:
10046     return CM_ScalarEpilogueNotNeededUsePredicate;
10047   case LoopVectorizeHints::FK_Disabled:
10048     return CM_ScalarEpilogueAllowed;
10049   };
10050 
10051   // 4) if the TTI hook indicates this is profitable, request predication.
10052   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10053                                        LVL.getLAI()))
10054     return CM_ScalarEpilogueNotNeededUsePredicate;
10055 
10056   return CM_ScalarEpilogueAllowed;
10057 }
10058 
// Return the vector value of \p Def for unroll part \p Part, materializing it
// on demand from scalar per-lane values — by broadcast when uniform, or by an
// insertelement packing sequence otherwise.
Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  // No per-lane scalars either: this must be a live-in IR value; broadcast it.
  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
  // Set the insert point after the last scalarized instruction or after the
  // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
  // will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP =
      isa<PHINode>(LastInst)
          ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
          : std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from undef.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Undef, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  // Restore the caller's insert point.
  Builder.restoreIP(OldIP);
  return VectorValue;
}
10125 
10126 // Process the loop in the VPlan-native vectorization path. This path builds
10127 // VPlan upfront in the vectorization pipeline, which allows to apply
10128 // VPlan-to-VPlan transformations from the very beginning without modifying the
10129 // input LLVM IR.
10130 static bool processLoopInVPlanNativePath(
10131     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10132     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10133     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10134     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10135     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10136     LoopVectorizationRequirements &Requirements) {
10137 
10138   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10139     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10140     return false;
10141   }
10142   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10143   Function *F = L->getHeader()->getParent();
10144   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10145 
10146   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10147       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10148 
10149   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10150                                 &Hints, IAI);
10151   // Use the planner for outer loop vectorization.
10152   // TODO: CM is not used at this point inside the planner. Turn CM into an
10153   // optional argument if we don't need it in the future.
10154   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10155                                Requirements, ORE);
10156 
10157   // Get user vectorization factor.
10158   ElementCount UserVF = Hints.getWidth();
10159 
10160   CM.collectElementTypesForWidening();
10161 
10162   // Plan how to best vectorize, return the best VF and its cost.
10163   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10164 
10165   // If we are stress testing VPlan builds, do not attempt to generate vector
10166   // code. Masked vector code generation support will follow soon.
10167   // Also, do not attempt to vectorize if no vector code will be produced.
10168   if (VPlanBuildStressTest || EnableVPlanPredication ||
10169       VectorizationFactor::Disabled() == VF)
10170     return false;
10171 
10172   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10173 
10174   {
10175     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10176                              F->getParent()->getDataLayout());
10177     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10178                            &CM, BFI, PSI, Checks);
10179     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10180                       << L->getHeader()->getParent()->getName() << "\"\n");
10181     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10182   }
10183 
10184   // Mark the loop as already vectorized to avoid vectorizing again.
10185   Hints.setAlreadyVectorized();
10186   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10187   return true;
10188 }
10189 
10190 // Emit a remark if there are stores to floats that required a floating point
10191 // extension. If the vectorized loop was generated with floating point there
10192 // will be a performance penalty from the conversion overhead and the change in
10193 // the vector width.
10194 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10195   SmallVector<Instruction *, 4> Worklist;
10196   for (BasicBlock *BB : L->getBlocks()) {
10197     for (Instruction &Inst : *BB) {
10198       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10199         if (S->getValueOperand()->getType()->isFloatTy())
10200           Worklist.push_back(S);
10201       }
10202     }
10203   }
10204 
10205   // Traverse the floating point stores upwards searching, for floating point
10206   // conversions.
10207   SmallPtrSet<const Instruction *, 4> Visited;
10208   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10209   while (!Worklist.empty()) {
10210     auto *I = Worklist.pop_back_val();
10211     if (!L->contains(I))
10212       continue;
10213     if (!Visited.insert(I).second)
10214       continue;
10215 
10216     // Emit a remark if the floating point store required a floating
10217     // point conversion.
10218     // TODO: More work could be done to identify the root cause such as a
10219     // constant or a function return type and point the user to it.
10220     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10221       ORE->emit([&]() {
10222         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10223                                           I->getDebugLoc(), L->getHeader())
10224                << "floating point conversion changes vector width. "
10225                << "Mixed floating point precision requires an up/down "
10226                << "cast that will negatively impact performance.";
10227       });
10228 
10229     for (Use &Op : I->operands())
10230       if (auto *OpI = dyn_cast<Instruction>(Op))
10231         Worklist.push_back(OpI);
10232   }
10233 }
10234 
// Fold the global EnableLoopInterleaving / EnableLoopVectorization flags into
// the pass options: when either feature is globally disabled, the
// corresponding transformation is only performed when explicitly forced.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}
10240 
/// Drive vectorization of a single loop: check hints and legality, pick a
/// vectorization factor and interleave count via the cost model and planner,
/// then generate code (vectorize, interleave, or both — possibly with a
/// vectorized epilogue). Returns true if the loop was transformed.
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      // Forbid a scalar epilogue so the tiny loop is not penalized by extra
      // scalar iterations.
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
  if (ForceOrderedReductions.getNumOccurrences() > 0)
    AllowOrderedReductions = ForceOrderedReductions;
  else
    AllowOrderedReductions = TTI->enableOrderedReductions();
  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Neither vectorize nor interleave the loop: emit a missed remark for
    // each decision and bail out.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector codegeneration is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
        LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
                        DT);
        ++LoopsVectorized;

        // Re-run loop-simplify and LCSSA formation before vectorizing the
        // epilogue.
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);

        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  // In asserts builds, verify that the transformed function is still
  // well-formed IR.
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
10603 
/// Shared implementation for both pass-manager entry points: caches the
/// supplied analyses on the pass object, simplifies all loops, then attempts
/// to vectorize each supported inner loop. Returns whether any IR change
/// and/or any CFG change was made.
LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  // Cache the analysis pointers in the pass object; processLoop reads them.
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything end up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // All loops have been processed; report whether anything changed.
  return LoopVectorizeResult(Changed, CFGChanged);
}
10669 
10670 PreservedAnalyses LoopVectorizePass::run(Function &F,
10671                                          FunctionAnalysisManager &AM) {
10672     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10673     auto &LI = AM.getResult<LoopAnalysis>(F);
10674     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10675     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10676     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10677     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10678     auto &AA = AM.getResult<AAManager>(F);
10679     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10680     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10681     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10682 
10683     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10684     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10685         [&](Loop &L) -> const LoopAccessInfo & {
10686       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10687                                         TLI, TTI, nullptr, nullptr, nullptr};
10688       return LAM.getResult<LoopAccessAnalysis>(L, AR);
10689     };
10690     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10691     ProfileSummaryInfo *PSI =
10692         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10693     LoopVectorizeResult Result =
10694         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10695     if (!Result.MadeAnyChange)
10696       return PreservedAnalyses::all();
10697     PreservedAnalyses PA;
10698 
10699     // We currently do not preserve loopinfo/dominator analyses with outer loop
10700     // vectorization. Until this is addressed, mark these analyses as preserved
10701     // only for non-VPlan-native path.
10702     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10703     if (!EnableVPlanNativePath) {
10704       PA.preserve<LoopAnalysis>();
10705       PA.preserve<DominatorTreeAnalysis>();
10706     }
10707 
10708     if (Result.MadeCFGChange) {
10709       // Making CFG changes likely means a loop got vectorized. Indicate that
10710       // extra simplification passes should be run.
10711       // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10712       // be run if runtime checks have been added.
10713       AM.getResult<ShouldRunExtraVectorPasses>(F);
10714       PA.preserve<ShouldRunExtraVectorPasses>();
10715     } else {
10716       PA.preserveSet<CFGAnalyses>();
10717     }
10718     return PA;
10719 }
10720 
10721 void LoopVectorizePass::printPipeline(
10722     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10723   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10724       OS, MapClassName2PassName);
10725 
10726   OS << "<";
10727   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10728   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10729   OS << ">";
10730 }
10731